diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7545e790f82..0fe627f4e7f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -159,7 +159,7 @@ Maintainers reserve the right to decline review or close pull requests for any r # Code maintenance -- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file reponsible for: +- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for: - Reviewing and merging related PRs - Fixing related bugs - Providing developer guidance/support diff --git a/common/arg.cpp b/common/arg.cpp index 05f4a5244e7..0260d79fef0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2399,7 +2399,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.fit_params = false; } else { throw std::runtime_error( - string_format("error: unkown value for --fit: '%s'\n", value.c_str())); + string_format("error: unknown value for --fit: '%s'\n", value.c_str())); } } ).set_env("LLAMA_ARG_FIT")); diff --git a/common/common.h b/common/common.h index c5a80375713..ae32d5053c5 100644 --- a/common/common.h +++ b/common/common.h @@ -869,7 +869,7 @@ std::string common_detokenize( // Embedding utils // -// TODO: repace embd_norm with an enum +// TODO: replace embd_norm with an enum void common_embd_normalize(const float * inp, float * out, int n, int embd_norm); float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); diff --git a/common/debug.h b/common/debug.h index 0c559632586..e563b40d68f 100644 --- a/common/debug.h +++ b/common/debug.h @@ -18,7 +18,7 @@ template void common_debug_print_tensor(uint8_t * data, ggml // prints tensors that are processed in the computation graph // by default prints all tensors, but can be configured by creating a `base_callback_data` instance with // non-empty filter_patterns. See examples/debug.ccp for possible usage patterns -// The template parameter determins whether an error should be thrown whenever a NaN is encountered +// The template parameter determines whether an error should be thrown whenever a NaN is encountered // in a tensor (useful for stopping debug sessions on first erroneous tensor) // The callback data will be passed as the third parameter (user_data) template bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data); diff --git a/common/jinja/README.md b/common/jinja/README.md index 7059105ee39..8291240767e 100644 --- a/common/jinja/README.md +++ b/common/jinja/README.md @@ -63,7 +63,7 @@ The llama.cpp Jinja engine introduces `jinja::string` (see `jinja/string.h`), wh - **One-to-many** (e.g., split): result is marked `is_input` **only if ALL** input parts are marked `is_input` - **Many-to-one** (e.g., join): same as one-to-many -For string concatenation, string parts will be appended to the new string as-is, while perserving the `is_input` flag. +For string concatenation, string parts will be appended to the new string as-is, while preserving the `is_input` flag. 
**Enabling Input Marking:** diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 09544173981..a6d259a6404 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4031,7 +4031,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # split Conv3D into Conv2Ds c1, c2, kt, kh, kw = data_torch.shape del c1, c2, kh, kw # unused - assert kt == 2, "Current implmentation only support temporal_patch_size of 2" + assert kt == 2, "Current implementation only support temporal_patch_size of 2" yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]) yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) else: @@ -5404,7 +5404,7 @@ def set_gguf_parameters(self): # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv linear_attn_config = self.hparams["linear_attn_config"] # n_head == 0 for KDA layers, n_head > 0 for MLA layers - # full_attention_layers list will be used to distingush layer type + # full_attention_layers list will be used to distinguish layer type _num_kv_heads = list() _full_attn_layers = linear_attn_config["full_attn_layers"] for il in range(self.hparams["num_hidden_layers"]): @@ -6505,7 +6505,7 @@ def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3) - # default values below are taken from HF tranformers code + # default values below are taken from HF transformers code self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) self.gguf_writer.add_vision_use_gelu(True) # calculate proj_scale_factor (used by tinygemma3 test model) @@ -7097,7 +7097,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if bid == 0 and "time_mix_a" in new_name: # dummy v0/v1/v2 on first layer - # easist way to make llama happy + # easiest way to make llama happy yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) yield (new_name, data_torch) @@ -9596,7 +9596,7 @@ def __init__(self, *args, **kwargs): # NOTE: Explicitly include hparam prefix prefix for d_model to # disambiguate with top-level head_dim # NOTE 2: If needed for future models, this can be isolated in a method - # to separate the prefix setting and teh keys used + # to separate the prefix setting and the keys used self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"]) self.n_group = self.find_hparam(["n_groups", "num_groups"]) self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model @@ -9743,7 +9743,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_value_length(self.head_dim) # Set feed_forward_length - # NOTE: This will trigger an override warning. This is preferrable to + # NOTE: This will trigger an override warning. This is preferable to # duplicating all the parent logic if not self.is_moe: n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"]) diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 23b6a627634..51adaaf95f5 100755 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -20,7 +20,7 @@ **Llama.cpp + CANN** -The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly. +The llama.cpp CANN backend is designed to support Ascend NPU. 
It utilize the ability of AscendC and ACLNN which are integrated to CANN Toolkit and kernels to using Ascend NPU directly. ## News @@ -210,7 +210,7 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager # and install driver. sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full ``` - If the following messaage appers, firmware is installed successfully. + If the following message appears, firmware is installed successfully. ```sh Firmware package installed successfully! ``` diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 07c68be5cbd..dd4c66dbe95 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -708,7 +708,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 - Remove **build** folder or try a clean-build. -- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux. +- I can **not** see `[ext_oneapi_level_zero:gpu]` after installing the GPU driver on Linux. Please double-check with `sudo sycl-ls`. diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md index 2c3f88e91a2..0783555ce8a 100644 --- a/docs/backend/snapdragon/README.md +++ b/docs/backend/snapdragon/README.md @@ -116,7 +116,7 @@ Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 ### Windows All artifacts are already installed in the `pkg-snapdragon` folder. -To run, adapt below instructions to use Powershell scrits in `scripts/snapdragon/windows`. +To run, adapt below instructions to use Powershell scripts in `scripts/snapdragon/windows`. ## How to Run diff --git a/docs/backend/snapdragon/windows.md b/docs/backend/snapdragon/windows.md index e9346ccadf1..6307e1b69f1 100644 --- a/docs/backend/snapdragon/windows.md +++ b/docs/backend/snapdragon/windows.md @@ -144,7 +144,7 @@ Once the build is complete HTP ops libraries will be installed like this -a---- 1/22/2026 6:01 PM 4139 libggml-htp.cat ``` -The .cat file, the signature and proper certicate installation can be verified with +The .cat file, the signature and proper certificate installation can be verified with ``` > signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat diff --git a/docs/build.md b/docs/build.md index e6f572c77f3..772731f6418 100644 --- a/docs/build.md +++ b/docs/build.md @@ -595,7 +595,7 @@ You can verify that KleidiAI is being used by running ```bash ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?" ``` -If KleidiAI is enabled, the ouput will contain a line similar to: +If KleidiAI is enabled, the output will contain a line similar to: ``` load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB ``` @@ -699,7 +699,7 @@ To read documentation for how to build on Android, [click here](./android.md) ## WebGPU [In Progress] -The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The currrent implementation is up-to-date with Dawn commit `bed1a61`. +The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`. 
In the llama.cpp directory, build with CMake: diff --git a/docs/multimodal/MobileVLM.md b/docs/multimodal/MobileVLM.md index 3bfab9f3d22..6c17dbf902e 100644 --- a/docs/multimodal/MobileVLM.md +++ b/docs/multimodal/MobileVLM.md @@ -281,7 +281,7 @@ llama_print_timings: total time = 5990.25 ms / 202 tokens Just the same as above. -**ouput** +**output** ```sh encode_image_with_clip: image embedding created: 144 tokens @@ -305,7 +305,7 @@ llama_print_timings: total time = 15513.95 ms / 412 tokens ## Run on Intel(R) Core(TM) Ultra7 115H ### operation system Windows11 -### comiple +### compile ```sh make -j32 ``` diff --git a/examples/debug/README.md b/examples/debug/README.md index 28e00c93427..2ea716eb543 100644 --- a/examples/debug/README.md +++ b/examples/debug/README.md @@ -2,7 +2,7 @@ This is a utility intended to help debug a model by registering a callback that logs GGML operations and tensor data. It can also store the generated logits or -embeddings as well as the prompt and token ids for comparision with the original +embeddings as well as the prompt and token ids for comparison with the original model. ### Usage diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md index f71d2413193..b3942002147 100644 --- a/examples/diffusion/README.md +++ b/examples/diffusion/README.md @@ -43,12 +43,12 @@ Choose one of the following scheduling methods: - `-b`: Batch size ### Examples -#### Dream architechture: +#### Dream architecture: ``` llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual ``` -#### LLaDA architechture: +#### LLaDA architecture: ``` llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual ``` diff --git a/examples/llama.vim b/examples/llama.vim index 736802d3655..23a281fc333 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -52,8 +52,8 @@ highlight llama_hl_info guifg=#77ff2f ctermfg=119 " n_prefix: number of lines before the cursor location to include in the local prefix " n_suffix: number of lines after the cursor location to include in the local suffix " n_predict: max number of tokens to predict -" t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported) -" t_max_predict_ms: max alloted time for the prediction +" t_max_prompt_ms: max allotted time for the prompt processing (TODO: not yet supported) +" t_max_predict_ms: max allotted time for the prediction " show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) " auto_fim: trigger FIM completion automatically on cursor movement " max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor diff --git a/examples/model-conversion/README.md b/examples/model-conversion/README.md index 637870a5c15..c43e642fee7 100644 --- a/examples/model-conversion/README.md +++ b/examples/model-conversion/README.md @@ -69,7 +69,7 @@ Command line arguments take precedence over environment variables when both are In cases where the transformer implementation for the model has not been released yet it is possible to set the environment variable `UNRELEASED_MODEL_NAME` which -will then cause the transformer implementation to be loaded explicitely and not +will then cause the transformer implementation to be loaded explicitly and not use AutoModelForCausalLM: ``` export 
UNRELEASED_MODEL_NAME=SomeNewModel @@ -120,7 +120,7 @@ The converted model can be inspected using the following command: (venv) $ make causal-run-converted-model ``` -### Model logits verfication +### Model logits verification The following target will run the original model and the converted model and compare the logits: ```console @@ -235,7 +235,7 @@ new model the model can be converted to GGUF format using the following command: (venv) $ make embedding-run-converted-model ``` -### Model logits verfication +### Model logits verification The following target will run the original model and the converted model (which was done manually in the previous steps) and compare the logits: ```console @@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO ## HuggingFace utilities The following targets are useful for creating collections and model repositories -on Hugging Face in the the ggml-org. These can be used when preparing a relase +on Hugging Face in the the ggml-org. These can be used when preparing a release to script the process for new model releases. For the following targets a `HF_TOKEN` environment variable is required. @@ -347,7 +347,7 @@ For the following targets a `HF_TOKEN` environment variable is required. > $ unset HF_TOKEN ### Create a new Hugging Face Model (model repository) -This will create a new model repsository on Hugging Face with the specified +This will create a new model repository on Hugging Face with the specified model name. ```console (venv) $ make hf-create-model MODEL_NAME='TestModel' NAMESPACE="danbev" ORIGINAL_BASE_MODEL="some-base-model" diff --git a/examples/sycl/README.md b/examples/sycl/README.md index 8819d87f56e..29143dd6176 100644 --- a/examples/sycl/README.md +++ b/examples/sycl/README.md @@ -6,11 +6,11 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU. |Tool Name| Function|Status| |-|-|-| -|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support| +|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support| ### llama-ls-sycl-device -List all SYCL devices with ID, compute capability, max work group size, ect. +List all SYCL devices with ID, compute capability, max work group size, etc. 1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*. 
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index a9d1778641e..9fd3f7f32a0 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -259,7 +259,7 @@ extern "C" { Example usage: // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned - // preferrably to run on the same backend as the buffer + // preferably to run on the same backend as the buffer ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true); diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h index 4703a05afe1..1c2ed79b774 100644 --- a/ggml/include/ggml-opt.h +++ b/ggml/include/ggml-opt.h @@ -138,7 +138,7 @@ extern "C" { GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); - // set gradients to zero, initilize loss, and optionally reset the optimizer + // set gradients to zero, initialize loss, and optionally reset the optimizer GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index fcc51f1f71a..784d69206b4 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2575,7 +2575,7 @@ extern "C" { struct ggml_tensor * grad, struct ggml_tensor * sgd_params); // alpha, weight decay - // build forward mutiple tensors and select one of them for computing + // build forward multiple tensors and select one of them for computing // this is useful for creating graphs that have constant topology but compute different things based on the input // ref: https://github.com/ggml-org/llama.cpp/pull/18550 // diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index b5aca76633c..93a6d397f79 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -195,7 +195,7 @@ struct tile_config_t{ // will be needed. // // Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16; -// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`. +// and the single batch gemm (m=1) has a special fast path with `avx512-vnni`. // // ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/ // advanced-matrix-extensions-intrinsics-functions.html @@ -1379,8 +1379,8 @@ struct tinygemm_kernel_vnni 4 #if _WIN32_WINNT >= 0x0602 diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index da412fd009b..5fd452a03d2 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -533,7 +533,7 @@ class tinyBLAS { if constexpr (RN > 1) { return mnpack(m, n, SIZE_N, BN); } else { - GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N); + GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N); GGML_ASSERT(false); // we have miss something. } } @@ -711,7 +711,7 @@ class tinyBLAS_RVV { if constexpr (RN > 1) { return mnpack(m, n, SIZE_N, BN); } else { - GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N); + GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N); GGML_ASSERT(false); // we have miss something. 
} } diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index b7a70e06f1d..ca1b3059b8c 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -375,7 +375,7 @@ static void ggml_compute_forward_dup_bytes( const size_t rs = ne00 * type_size; if (nb00 == type_size) { - // src0 is contigous on first dimension, copy by rows + // src0 is contiguous on first dimension, copy by rows for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { id += rs * ir0; @@ -1795,7 +1795,7 @@ void ggml_compute_forward_repeat( { ggml_compute_forward_repeat_f32(params, dst); } break; - // TODO: templateify the implemenation and support for I64 + // TODO: templateify the implementation and support for I64 // ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225 //case GGML_TYPE_I64: // { diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 5edba4212f6..02c3cc3119b 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -3032,7 +3032,7 @@ template src[1])); - size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc. + size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block. const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert const int64_t ne12 = op->src[1]->ne[2]; // n_tokens @@ -3297,7 +3297,7 @@ template wdata; auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t)); - // total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t) + // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t) auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12] diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index beb7e32e4fc..fff70c8eb89 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1215,7 +1215,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } // If attention sinks are used, potentially re-scale if KQ_max is small. - // Also add the sink as a value to KQ_rowsum, this is done after synchonization of KQ_rowsum + // Also add the sink as a value to KQ_rowsum, this is done after synchronization of KQ_rowsum // so it's being done unconditionally for every thread. if (!is_fixup && (np == 1 || threadIdx.y % np == 0) && sinks_f) { float KQ_max_scale[cols_per_thread]; diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index 3f4a78cc6e5..7cbe32633e5 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() { return 128; } -// Currenlty llvm with the amdgcn target does not support unrolling loops +// Currently llvm with the amdgcn target does not support unrolling loops // that contain a break that can not be resolved at compile time. 
#ifdef __clang__ #pragma clang diagnostic push diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh index cd3bfd4051a..aaf711a618c 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh @@ -18,7 +18,7 @@ #if defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1 #define GGML_USE_WMMA_FATTN #elif defined(RDNA4) -#warning "rocwmma fattn is not suported on RDNA4 on rocwmma < v2.0.0, expect degraded performance" +#warning "rocwmma fattn is not supported on RDNA4 on rocwmma < v2.0.0, expect degraded performance" #endif // defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1 #endif // defined(GGML_HIP_ROCWMMA_FATTN) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 7e6d3303549..b56e3d50f58 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3330,7 +3330,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, return false; } - //rms_norm kernel assumes contigous rows + //rms_norm kernel assumes contiguous rows if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) { return false; } diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index a8c68e44b16..4300ffc148c 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -235,7 +235,7 @@ static __global__ void quantize_mmq_q8_1( q.z = roundf(xi.z*d_inv); q.w = roundf(xi.w*d_inv); - // Write back 4 int8 values as a single 32 bit value for better memroy bandwidth: + // Write back 4 int8 values as a single 32 bit value for better memory bandwidth: char4 * yqs4 = (char4 *) y[ib].qs; yqs4[iqs/4] = q; diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu index dc06d06930e..285c0e9543a 100644 --- a/ggml/src/ggml-cuda/softmax.cu +++ b/ggml/src/ggml-cuda/softmax.cu @@ -46,7 +46,7 @@ struct soft_max_params { }; // When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled. -// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here. +// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here. #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wpass-failed" diff --git a/ggml/src/ggml-cuda/solve_tri.cu b/ggml/src/ggml-cuda/solve_tri.cu index 177ffc268f1..07ca33f513b 100644 --- a/ggml/src/ggml-cuda/solve_tri.cu +++ b/ggml/src/ggml-cuda/solve_tri.cu @@ -83,7 +83,7 @@ static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx, // ====================== // When ncols_template == 0 the bounds for the loops in this function are not // known and can't be unrolled. As we want to keep pragma unroll for all other -// cases we supress the clang transformation warning here. +// cases we suppress the clang transformation warning here. 
#ifdef __clang__ # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wpass-failed" diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 7a44443a8a3..3006e217796 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -139,7 +139,7 @@ struct ggml_hexagon_session { }; void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) { - // Bump pending flag (cleared in the session::flush once we get the responce) + // Bump pending flag (cleared in the session::flush once we get the response) this->op_pending++; // atomic inc int err = dspqueue_write(this->queue, @@ -443,7 +443,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) { // Repack the scales // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) - // the last block is truncated and overriden by the scales. + // the last block is truncated and overridden by the scales. for (int i = 0; i < nb; i++) { // Repack the scales ggml_half * d = (ggml_half *) (y_d + i * dblk_size); @@ -503,7 +503,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) { // Repack the scales // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) - // the last block is truncated and overriden by the scales. + // the last block is truncated and overridden by the scales. for (int i = 0; i < nb; i++) { // Unpack the scales const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size); @@ -552,7 +552,7 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) { // Init the scales // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) - // the last block is truncated and overriden by the scales. + // the last block is truncated and overridden by the scales. for (int i = 0; i < nb; i++) { // Unpack the scales x[i * 8 + 0].d = 0; @@ -770,7 +770,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) { // Repack the scales // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) - // the last block is truncated and overriden by the scales. + // the last block is truncated and overridden by the scales. for (int i = 0; i < nb; i++) { // Repack the scales ggml_half * d = (ggml_half *) (y_d + i * dblk_size); @@ -829,7 +829,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) { // Repack the scales // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) - // the last block is truncated and overriden by the scales. + // the last block is truncated and overridden by the scales. for (int i = 0; i < nb; i++) { // Unpack the scales const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size); @@ -878,7 +878,7 @@ static void init_row_q8x4x2(block_q8_0 * x, int64_t k) { // Init the scales // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2) - // the last block is truncated and overriden by the scales. + // the last block is truncated and overridden by the scales. for (int i = 0; i < nb; i++) { // Unpack the scales x[i * 8 + 0].d = 0; @@ -1120,7 +1120,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) // Repack the scales // Note: Do not combine with the loop above. 
For tensor sizes not multiple of 256 (QK_MXFP4x4x2) - // the last block is truncated and overriden by the scales. + // the last block is truncated and overridden by the scales. for (int i = 0; i < nb; i++) { // Repack the scales uint8_t * e = (uint8_t *) (y_e + i * eblk_size); @@ -1180,7 +1180,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) // Repack the scales // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2) - // the last block is truncated and overriden by the scales. + // the last block is truncated and overridden by the scales. for (int i = 0; i < nb; i++) { // Unpack the scales const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size); @@ -1229,7 +1229,7 @@ static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) { // Init the scales // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2) - // the last block is truncated and overriden by the scales. + // the last block is truncated and overridden by the scales. for (int i = 0; i < nb; i++) { // Unpack the scales x[i * 8 + 0].e = 0; @@ -2670,7 +2670,7 @@ static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector pair of F32 vectors - HVX_VectorPair xs_p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(ptr_x[i]), S); - ptr_y[i*2] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(xs_p), ptr_y[i*2])); - ptr_y[i*2+1] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(xs_p), ptr_y[i*2+1])); + vy_p[i] = hvx_vec_mpyacc_f32_f16(vy_p[i], Q6_Vh_vshuff_Vh(vx0[i]), S0); } if (nloe) { - HVX_VectorPair xs_p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(ptr_x[i]), S); + HVX_VectorPair xy_p = vy_p[i]; + xy_p = hvx_vec_mpyacc_f32_f16(xy_p, Q6_Vh_vshuff_Vh(vx0[i]), S0); - HVX_Vector xs = Q6_V_lo_W(xs_p); - i = 2 * i; // index for ptr_y + HVX_Vector xy = Q6_V_lo_W(xy_p); + i = 2 * i; // index for vy - if (nloe >= 32) { - ptr_y[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i])); - nloe -= 32; ++i; xs = Q6_V_hi_W(xs_p); + if (nloe >= VLEN_FP32) { + vy[i] = xy; + nloe -= VLEN_FP32; ++i; xy = Q6_V_hi_W(xy_p); } if (nloe) { - HVX_Vector xy = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i])); - hvx_vec_store_a(&ptr_y[i], nloe * 4, xy); + hvx_vec_store_a(&vy[i], nloe * 4, xy); } } } -// MAD: y (F32) += x0 (F16) * s0 (F32) + x1 (F16) * s1 (F32) -static inline void hvx_mad_f32_f16_aa_rx2(float * restrict y, - const void * restrict x0, - const void * restrict x1, - float s0, - float s1, - int n) { - const HVX_Vector * restrict ptr_x0 = (const HVX_Vector *) x0; - const HVX_Vector * restrict ptr_x1 = (const HVX_Vector *) x1; - HVX_Vector * restrict ptr_y = (HVX_Vector *) y; +// MAD: y (F32) += x0 (F16) * s0 (F16) + x1 (F16) * s1 (F16) +static inline void hvx_mad_f32_f16_aa_rx2(float * restrict y, const void * restrict x0, const void * restrict x1, + const __fp16 * restrict s0, const __fp16 * restrict s1, int n) { + const HVX_Vector * restrict vx0 = (const HVX_Vector *) x0; + const HVX_Vector * restrict vx1 = (const HVX_Vector *) x1; + + HVX_VectorPair * restrict vy_p = (HVX_VectorPair *) y; + HVX_Vector * restrict vy = (HVX_Vector *) y; uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors uint32_t nloe = n % VLEN_FP16; // leftover elements - HVX_Vector S0 = hvx_vec_splat_f16(s0); - HVX_Vector S1 = hvx_vec_splat_f16(s1); + HVX_Vector S0 = hvx_vec_splat_f16(*s0); + HVX_Vector S1 = hvx_vec_splat_f16(*s1); uint32_t i = 0; + #pragma unroll(2) for (i = 0; i < nvec; ++i) { - // Multiply x * s -> pair of F32 
vectors - HVX_VectorPair xs0_p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(ptr_x0[i]), S0); - HVX_VectorPair xs1_p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(ptr_x1[i]), S1); - - HVX_Vector xs_p_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xs0_p), Q6_V_lo_W(xs1_p)); - HVX_Vector xs_p_hi = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_hi_W(xs0_p), Q6_V_hi_W(xs1_p)); - - ptr_y[i * 2] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs_p_lo, ptr_y[i * 2])); - ptr_y[i * 2 + 1] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs_p_hi, ptr_y[i * 2 + 1])); + vy_p[i] = hvx_vec_mpyacc_f32_f16(vy_p[i], Q6_Vh_vshuff_Vh(vx0[i]), S0); + vy_p[i] = hvx_vec_mpyacc_f32_f16(vy_p[i], Q6_Vh_vshuff_Vh(vx1[i]), S1); } if (nloe) { - HVX_VectorPair xs0_p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(ptr_x0[i]), S0); - HVX_VectorPair xs1_p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(ptr_x1[i]), S1); + HVX_VectorPair xy_p = vy_p[i]; + xy_p = hvx_vec_mpyacc_f32_f16(xy_p, Q6_Vh_vshuff_Vh(vx0[i]), S0); + xy_p = hvx_vec_mpyacc_f32_f16(xy_p, Q6_Vh_vshuff_Vh(vx1[i]), S1); - HVX_Vector xs_p_lo = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xs0_p), Q6_V_lo_W(xs1_p)); - HVX_Vector xs = xs_p_lo; - i = 2 * i; // index for ptr_y + HVX_Vector xy = Q6_V_lo_W(xy_p); + i = 2 * i; // index for vy - if (nloe >= 32) { - ptr_y[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i])); - nloe -= 32; ++i; - xs = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_hi_W(xs0_p), Q6_V_hi_W(xs1_p)); + if (nloe >= VLEN_FP32) { + vy[i] = xy; + nloe -= VLEN_FP32; ++i; xy = Q6_V_hi_W(xy_p); } if (nloe) { - HVX_Vector xy = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(xs, ptr_y[i])); - hvx_vec_store_a(&ptr_y[i], nloe * 4, xy); + hvx_vec_store_a(&vy[i], nloe * 4, xy); } } } -#define FLASH_ATTN_BLOCK_SIZE 128 - struct htp_fa_context { const struct htp_ops_context * octx; @@ -226,7 +246,12 @@ struct htp_fa_context { size_t size_v_block; size_t size_m_block; + uint32_t qrows; + uint32_t qrows_per_thread; + bool is_q_fp32; + + uint64_t t_start; }; static inline void hvx_scale_vec_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, HVX_Vector vs) { @@ -296,9 +321,8 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * const uint32_t nb3 = dst->nb[3]; // total rows in q - const uint32_t nr = neq1*neq2*neq3; - - const uint32_t dr = (nr + nth - 1) / nth; + const uint32_t nr = factx->qrows; + const uint32_t dr = factx->qrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = MIN(ir0 + dr, nr); @@ -337,15 +361,8 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * const uint8_t * q_row_ptr = (const uint8_t *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3); dma_queue_push(dma, dma_make_ptr(spad_q, q_row_ptr), factx->size_q_row_padded, nbq1, size_q_row, 1); - const uint32_t h = iq2; // head index - const float slope = (factx->max_bias > 0.0f) ? (h < factx->n_head_log2 ? 
powf(factx->m0, h + 1) : powf(factx->m1, 2*(h - factx->n_head_log2) + 1)) : 1.0f; - - HVX_Vector S_vec = hvx_vec_splat_f32(0.0f); - HVX_Vector M_vec = hvx_vec_splat_f32(-INFINITY); - - // Clear accumulator - hvx_splat_f32_a(spad_a, 0, DV); - float * VKQ32 = (float *) spad_a; + // FARF(HIGH, "fa %u: prefetch Q: ir %u iq1 %u iq2 %u iq3 %u q_row_ptr %p size %u : usec %u", ith, ir, iq1, iq2, iq3, q_row_ptr, size_q_row, + // (unsigned)HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - factx->t_start)); const __fp16 * mp_base = NULL; if (mask) { @@ -376,8 +393,23 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * // Mask is 1D contiguous for this row dma_queue_push(dma, dma_make_ptr(m_dst, m_src), current_block_size * 2, current_block_size * 2, current_block_size * 2, 1); } + + // FARF(HIGH, "fa %u: prefetch KVM: ir %u ib %u iq1 %u iq2 %u iq3 %u : size_k_row %u size_v_row %u bs %u: usec %u", + // ith, ir, ib, iq1, iq2, iq3, + // size_k_row, size_v_row, current_block_size, + // (unsigned)HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - factx->t_start)); } + const uint32_t h = iq2; // head index + const float slope = (factx->max_bias > 0.0f) ? (h < factx->n_head_log2 ? powf(factx->m0, h + 1) : powf(factx->m1, 2*(h - factx->n_head_log2) + 1)) : 1.0f; + + HVX_Vector S_vec = hvx_vec_splat_f32(0.0f); + HVX_Vector M_vec = hvx_vec_splat_f32(-INFINITY); + + // Clear accumulator + hvx_splat_f32_a(spad_a, 0, DV); + float * VKQ32 = (float *) (spad_a + 0); + uint8_t * q_ptr_vtcm = dma_queue_pop(dma).dst; if (factx->is_q_fp32) { hvx_copy_f16_f32_aa(q_ptr_vtcm, q_ptr_vtcm, DK); // inplace convert f32 to f16 @@ -393,23 +425,19 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * uint8_t * v_base = dma_queue_pop(dma).dst; // V __fp16 * m_base = mask ? dma_queue_pop(dma).dst : NULL; // M + // FARF(HIGH, "fa %u: process: ir %u ib %u : iq1 %u iq2 %u iq3 %u q_ptr_vtcm %p : usec %u", + // ith, ir, ib, iq1, iq2, iq3, q_ptr_vtcm, + // (unsigned)HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - factx->t_start)); + // Inner loop processing the block from VTCM uint32_t ic = 0; - // Process in blocks of 32 (VLEN_FP32) - static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 <= 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage"); - HVX_Vector_x4 scores_x4; + // Process in sub-blocks of 32 (VLEN_FP32) + HVX_Vector sb_scores[FLASH_ATTN_BLOCK_SIZE / VLEN_FP32]; HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY); for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) { // 1. Compute scores - float __attribute__((aligned(VLEN))) scores_arr[VLEN_FP32]; - for (uint32_t j = 0; j < VLEN_FP32; j += 2) { - const uint32_t cur_ic = ic + j; - const uint8_t * k_ptr = k_base + cur_ic * factx->size_k_row_padded; - hvx_dot_f16_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + factx->size_k_row_padded, DK, factx->scale); - } - - HVX_Vector scores = *(HVX_Vector *) scores_arr; + HVX_Vector scores = hvx_dot_f16_f16_aa_rx32(q_ptr_vtcm, k_base + ic * factx->size_k_row_padded, factx->size_k_row_padded, DK, factx->scale); // 2. Softcap if (factx->logit_softcap != 0.0f) { @@ -428,35 +456,35 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * scores = Q6_Vsf_equals_Vqf32(scores); } - scores_x4.v[iv] = scores; + sb_scores[iv] = scores; v_max = hvx_vec_reduce_max2_f32(scores, v_max); // All lanes have block max } { // 4. 
Online Softmax Update HVX_Vector M_new_vec = Q6_Vsf_vmax_VsfVsf(v_max, M_vec); - HVX_Vector diff_vec = Q6_Vqf32_vsub_VsfVsf(M_vec, M_new_vec); - HVX_Vector ms_vec = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(diff_vec)); + HVX_Vector diff_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(M_vec, M_new_vec)); + HVX_Vector ms_vec = hvx_vec_exp_f32(diff_vec); M_vec = M_new_vec; hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec); HVX_Vector p_sum_vec = hvx_vec_splat_f32(0.0f); for (uint32_t ic2 = 0, iv = 0; ic2 + VLEN_FP32 <= current_block_size; ic2 += VLEN_FP32, ++iv) { - HVX_Vector scores = scores_x4.v[iv]; + HVX_Vector scores = sb_scores[iv]; HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_vec); HVX_Vector P = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(scores_shifted)); p_sum_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(p_sum_vec, P)); // 5. Accumulate V - float __attribute__((aligned(VLEN))) p_arr[VLEN_FP32]; - *(HVX_Vector *) p_arr = P; + __fp16 __attribute__((aligned(VLEN))) p_arr[VLEN_FP16]; + hvx_vec_f32_to_f16_a(p_arr, P, hvx_vec_splat_f32(0)); for (uint32_t j = 0; j < VLEN_FP32; j += 2) { const uint32_t cur_ic = ic2 + j; const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded; - hvx_mad_f32_f16_aa_rx2(VKQ32, v_ptr, v_ptr + factx->size_v_row_padded, p_arr[j], p_arr[j + 1], DV); + hvx_mad_f32_f16_aa_rx2(VKQ32, v_ptr, v_ptr + factx->size_v_row_padded, (p_arr + j), (p_arr + j + 1), DV); } } @@ -464,47 +492,50 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * S_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(S_vec, ms_vec)), p_sum_vec)); } - // Sync scalars for leftover/next block if needed - float M = hvx_vec_get_f32(M_vec); - float S = hvx_vec_get_f32(S_vec); + if (ic < current_block_size) { + // Sync scalars for leftover/next block if needed + float M = hvx_vec_get_f32(M_vec); + float S = hvx_vec_get_f32(S_vec); + + // Leftover + for (; ic < current_block_size; ++ic) { + float s_val; + const uint8_t * k_ptr = k_base + ic * factx->size_k_row_padded; + hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, factx->scale); + if (factx->logit_softcap != 0.0f) { + s_val = factx->logit_softcap * tanhf(s_val); + } - // Leftover - for (; ic < current_block_size; ++ic) { - float s_val; - const uint8_t * k_ptr = k_base + ic * factx->size_k_row_padded; - hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, factx->scale); - if (factx->logit_softcap != 0.0f) { - s_val = factx->logit_softcap * tanhf(s_val); - } + if (mask) { + const float m_val = m_base[ic]; + s_val += slope * m_val; + } - if (mask) { - const float m_val = m_base[ic]; - s_val += slope * m_val; - } + const float Mold = M; + __fp16 vs = 1.0f; + + if (s_val > M) { + M = s_val; + HVX_Vector diff_vec = hvx_vec_splat_f32(Mold - M); + HVX_Vector ms_vec = hvx_vec_exp_f32(diff_vec); + hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec); + + float ms = hvx_vec_get_f32(ms_vec); + S = S * ms + vs; + } else { + HVX_Vector diff_vec = hvx_vec_splat_f32(s_val - M); + vs = hvx_vec_get_f32(hvx_vec_exp_f32(diff_vec)); + S += vs; + } - const float Mold = M; - float vs = 1.0f; - - if (s_val > M) { - M = s_val; - HVX_Vector diff_vec = hvx_vec_splat_f32(Mold - M); - HVX_Vector ms_vec = hvx_vec_exp_f32(diff_vec); - hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec); - - float ms = hvx_vec_get_f32(ms_vec); - S = S * ms + vs; - } else { - HVX_Vector diff_vec = hvx_vec_splat_f32(s_val - M); - vs = 
hvx_vec_get_f32(hvx_vec_exp_f32(diff_vec)); - S += vs; - } + const uint8_t * v_ptr = v_base + ic * factx->size_v_row_padded; - const uint8_t * v_ptr = v_base + ic * factx->size_v_row_padded; + hvx_mad_f32_f16_aa(VKQ32, v_ptr, &vs, DV); + } - hvx_mad_f32_f16_aa(VKQ32, v_ptr, DV, vs); + M_vec = hvx_vec_splat_f32(M); + S_vec = hvx_vec_splat_f32(S); } - M_vec = hvx_vec_splat_f32(M); - S_vec = hvx_vec_splat_f32(S); // Issue DMA for next+1 block (if exists) if (ib + 2 < factx->n_blocks) { @@ -525,6 +556,11 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * const uint8_t * m_src = (const uint8_t *) (mp_base + next_ic_start); dma_queue_push(dma, dma_make_ptr(m_base, m_src), next_block_size * 2, next_block_size * 2, next_block_size * 2, 1); } + + // FARF(HIGH, "fa %u: prefetch KVM: ir %u ib %u : iq1 %u iq2 %u iq3 %u : size_k_row %u size_v_row %u bs %u: usec %u", + // ith, ir, next_ib, iq1, iq2, iq3, + // size_k_row, size_v_row, next_block_size, + // (unsigned)HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - factx->t_start)); } } @@ -586,6 +622,8 @@ int op_flash_attn_ext(struct htp_ops_context * octx) { struct htp_fa_context factx; factx.octx = octx; + factx.t_start = HAP_perf_get_qtimer_count(); + factx.src0_div21 = init_fastdiv_values(q->ne[2] * q->ne[1]); factx.src0_div1 = init_fastdiv_values(q->ne[1]); @@ -632,6 +670,15 @@ int op_flash_attn_ext(struct htp_ops_context * octx) { factx.m0 = powf(2.0f, -(max_bias ) / factx.n_head_log2); factx.m1 = powf(2.0f, -(max_bias / 2.0f) / factx.n_head_log2); + // total rows in q + const uint32_t neq0 = q->ne[0]; + const uint32_t neq1 = q->ne[1]; + const uint32_t neq2 = q->ne[2]; + const uint32_t neq3 = q->ne[3]; + + factx.qrows = neq1*neq2*neq3; + factx.qrows_per_thread = (factx.qrows + octx->n_threads - 1) / octx->n_threads; + size_t size_vkq_acc = hex_round_up(v->ne[0] * sizeof(float), 128); // VKQ32 octx->src0_spad.size_per_thread = size_q_block * 1; diff --git a/ggml/src/ggml-hexagon/htp/hvx-base.h b/ggml/src/ggml-hexagon/htp/hvx-base.h index 12a1b7f1288..701637f22b2 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-base.h +++ b/ggml/src/ggml-hexagon/htp/hvx-base.h @@ -38,7 +38,7 @@ static inline HVX_Vector hvx_vec_splat_f32(float v) { return Q6_V_vsplat_R(u.i); } -static inline HVX_Vector hvx_vec_splat_f16(float v) { +static inline HVX_Vector hvx_vec_splat_f16(_Float16 v) { union { __fp16 f; uint16_t i; } u = { .f = v }; return Q6_Vh_vsplat_R(u.i); } @@ -170,4 +170,23 @@ static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) { return Q6_Vh_vround_VwVw_sat(vsf_1, vsf_0); } +#if __HVX_ARCH__ < 79 + +static inline HVX_VectorPair hvx_vec_mpyacc_f32_f16(HVX_VectorPair acc, HVX_Vector x, HVX_Vector y) +{ + HVX_VectorPair m = Q6_Wqf32_vmpy_VhfVhf(x, y); + HVX_Vector a0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(m), Q6_V_lo_W(acc))); + HVX_Vector a1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(m), Q6_V_hi_W(acc))); + return Q6_W_vcombine_VV(a1, a0); +} + +#else + +static inline HVX_VectorPair hvx_vec_mpyacc_f32_f16(HVX_VectorPair acc, HVX_Vector x, HVX_Vector y) +{ + return Q6_Wsf_vmpyacc_WsfVhfVhf(acc, x, y); +} + +#endif + #endif /* HVX_BASE_H */ diff --git a/ggml/src/ggml-hexagon/htp/hvx-copy.h b/ggml/src/ggml-hexagon/htp/hvx-copy.h index ae0dbed0306..851482e01b2 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-copy.h +++ b/ggml/src/ggml-hexagon/htp/hvx-copy.h @@ -42,11 +42,11 @@ static inline void hvx_splat_f32_u(uint8_t * restrict dst, float v, uint32_t n) hvx_splat_u(dst, 
hvx_vec_splat_f32(v), n, sizeof(float)); } -static inline void hvx_splat_f16_a(uint8_t * restrict dst, float v, uint32_t n) { +static inline void hvx_splat_f16_a(uint8_t * restrict dst, _Float16 v, uint32_t n) { hvx_splat_u(dst, hvx_vec_splat_f16(v), n, sizeof(__fp16)); } -static inline void hvx_splat_f16_u(uint8_t * restrict dst, float v, uint32_t n) { +static inline void hvx_splat_f16_u(uint8_t * restrict dst, _Float16 v, uint32_t n) { hvx_splat_u(dst, hvx_vec_splat_f16(v), n, sizeof(__fp16)); } diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.h b/ggml/src/ggml-hexagon/htp/hvx-inverse.h index 49f3efabbcc..53db94aae2b 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-inverse.h +++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.h @@ -67,7 +67,7 @@ static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) { HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); //count leading zeros - // Get mantissa for 16-bit represenation + // Get mantissa for 16-bit representation HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF)); //Compute Reciprocal Exponent diff --git a/ggml/src/ggml-hexagon/htp/hvx-reduce.h b/ggml/src/ggml-hexagon/htp/hvx-reduce.h index 1ca7c05d983..3c0073ef6d8 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-reduce.h +++ b/ggml/src/ggml-hexagon/htp/hvx-reduce.h @@ -46,6 +46,21 @@ static inline HVX_Vector hvx_vec_reduce_sum_qf32(HVX_Vector in) { #if __HVX_ARCH__ > 75 +static inline HVX_Vector hvx_vec_reduce_sum_f32x4(HVX_Vector_x4 in) { + HVX_VectorPair sum_p01 = Q6_W_vshuff_VVR(in.v[1], in.v[0], 4); + HVX_VectorPair sum_p23 = Q6_W_vshuff_VVR(in.v[3], in.v[2], 4); + HVX_Vector sum_sf01 = Q6_Vsf_vadd_VsfVsf(Q6_V_lo_W(sum_p01), Q6_V_hi_W(sum_p01)); + HVX_Vector sum_sf23 = Q6_Vsf_vadd_VsfVsf(Q6_V_lo_W(sum_p23), Q6_V_hi_W(sum_p23)); + + HVX_VectorPair sum_p0123 = Q6_W_vshuff_VVR(sum_sf23, sum_sf01, 8); + HVX_Vector sum_sf = Q6_Vsf_vadd_VsfVsf(Q6_V_lo_W(sum_p0123), Q6_V_hi_W(sum_p0123)); + + sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 2)); + sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 4)); + sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 8)); + return sum_sf; +} + static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) { HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4); HVX_Vector sum_sf = Q6_Vsf_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump)); @@ -72,6 +87,21 @@ static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) #else +static inline HVX_Vector hvx_vec_reduce_sum_f32x4(HVX_Vector_x4 in) { + HVX_VectorPair sum_p01 = Q6_W_vshuff_VVR(in.v[1], in.v[0], 4); + HVX_VectorPair sum_p23 = Q6_W_vshuff_VVR(in.v[3], in.v[2], 4); + HVX_Vector sum_qf01 = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(sum_p01), Q6_V_hi_W(sum_p01)); + HVX_Vector sum_qf23 = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(sum_p23), Q6_V_hi_W(sum_p23)); + + HVX_VectorPair sum_p0123 = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(sum_qf23), Q6_Vsf_equals_Vqf32(sum_qf01), 8); + HVX_Vector sum_qf = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(sum_p0123), Q6_V_hi_W(sum_p0123)); + + sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 2)); + sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 4)); + sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 8)); + return Q6_Vsf_equals_Vqf32(sum_qf); +} + static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) { HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4); 
HVX_Vector sum_qf = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump)); diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 6f6f51f01f5..9ca74aedfef 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -1234,27 +1234,24 @@ static void vec_dot_f16_f16_aa_1x1(const int n, float * restrict s, const void * uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors uint32_t nloe = n % VLEN_FP16; // leftover elements - HVX_Vector rsum = Q6_V_vsplat_R(0); + HVX_VectorPair rsum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); uint32_t i = 0; #pragma unroll(4) for (i = 0; i < nvec; i++) { - HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x[i], y[i]); - rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); + rsum_p = hvx_vec_mpyacc_f32_f16(rsum_p, x[i], y[i]); } if (nloe) { HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2); HVX_Vector x_hf = Q6_V_vand_QV(bmask, x[i]); HVX_Vector y_hf = Q6_V_vand_QV(bmask, y[i]); - - HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf); - rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); + rsum_p = hvx_vec_mpyacc_f32_f16(rsum_p, x_hf, y_hf); } - rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum)); - hvx_vec_store_u(&s[0], 4, rsum); + HVX_Vector rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p))); + hvx_vec_store_u(s, 4, hvx_vec_reduce_sum_f32(rsum)); } static void vec_dot_f16_f16_aa_2x1(const int n, float * restrict s0, @@ -1267,35 +1264,30 @@ static void vec_dot_f16_f16_aa_2x1(const int n, float * restrict s0, uint32_t nvec = n / VLEN_FP16; uint32_t nloe = n % VLEN_FP16; - HVX_Vector rsum0 = Q6_V_vsplat_R(0); - HVX_Vector rsum1 = Q6_V_vsplat_R(0); + HVX_VectorPair rsum0_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); + HVX_VectorPair rsum1_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); uint32_t i = 0; #pragma unroll(2) for (i = 0; i < nvec; i++) { HVX_Vector y_hf = y[i]; - HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0[i], y_hf); - HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1[i], y_hf); - - rsum0 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum0, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf))); - rsum1 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum1, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf))); + rsum0_p = hvx_vec_mpyacc_f32_f16(rsum0_p, x0[i], y_hf); + rsum1_p = hvx_vec_mpyacc_f32_f16(rsum1_p, x1[i], y_hf); } if (nloe) { HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2); + HVX_Vector y_hf = Q6_V_vand_QV(bmask, y[i]); HVX_Vector x0_hf = Q6_V_vand_QV(bmask, x0[i]); HVX_Vector x1_hf = Q6_V_vand_QV(bmask, x1[i]); - HVX_Vector y_hf = Q6_V_vand_QV(bmask, y[i]); - - HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf); - HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf); - - rsum0 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum0, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf))); - rsum1 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum1, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf))); + rsum0_p = hvx_vec_mpyacc_f32_f16(rsum0_p, x0_hf, y_hf); + rsum1_p = hvx_vec_mpyacc_f32_f16(rsum1_p, x1_hf, y_hf); } - HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(Q6_Vsf_equals_Vqf32(rsum0), Q6_Vsf_equals_Vqf32(rsum1)); + HVX_Vector rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p))); + HVX_Vector rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum1_p), 
Q6_V_hi_W(rsum1_p))); + HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(rsum0, rsum1); hvx_vec_store_u(s0, 8, rsum); } @@ -1311,10 +1303,10 @@ static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * res uint32_t nloe = n % VLEN_FP16; // Row sums (sf) - 4 accumulators for 2×2 tile - HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0); - HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0); - HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0); - HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0); + HVX_VectorPair r0_c0_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); + HVX_VectorPair r0_c1_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); + HVX_VectorPair r1_c0_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); + HVX_VectorPair r1_c1_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0)); uint32_t i = 0; @@ -1326,20 +1318,10 @@ static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * res HVX_Vector c1_hf = y1[i]; // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1 - HVX_VectorPair r0_c0_qf_p = Q6_Wqf32_vmpy_VhfVhf(r0_hf, c0_hf); - HVX_VectorPair r0_c1_qf_p = Q6_Wqf32_vmpy_VhfVhf(r0_hf, c1_hf); - HVX_VectorPair r1_c0_qf_p = Q6_Wqf32_vmpy_VhfVhf(r1_hf, c0_hf); - HVX_VectorPair r1_c1_qf_p = Q6_Wqf32_vmpy_VhfVhf(r1_hf, c1_hf); - - HVX_Vector r0_c0_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r0_c0_qf_p), Q6_V_hi_W(r0_c0_qf_p)); - HVX_Vector r0_c1_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r0_c1_qf_p), Q6_V_hi_W(r0_c1_qf_p)); - HVX_Vector r1_c0_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r1_c0_qf_p), Q6_V_hi_W(r1_c0_qf_p)); - HVX_Vector r1_c1_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r1_c1_qf_p), Q6_V_hi_W(r1_c1_qf_p)); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_qf, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_qf, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_qf, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_qf, r1_c1_sum)); + r0_c0_sum_p = hvx_vec_mpyacc_f32_f16(r0_c0_sum_p, r0_hf, c0_hf); + r0_c1_sum_p = hvx_vec_mpyacc_f32_f16(r0_c1_sum_p, r0_hf, c1_hf); + r1_c0_sum_p = hvx_vec_mpyacc_f32_f16(r1_c0_sum_p, r1_hf, c0_hf); + r1_c1_sum_p = hvx_vec_mpyacc_f32_f16(r1_c1_sum_p, r1_hf, c1_hf); } if (nloe) { @@ -1350,23 +1332,17 @@ static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * res HVX_Vector c0_hf = Q6_V_vand_QV(bmask, y0[i]); HVX_Vector c1_hf = Q6_V_vand_QV(bmask, y1[i]); - HVX_VectorPair r0_c0_qf_p = Q6_Wqf32_vmpy_VhfVhf(r0_hf, c0_hf); - HVX_VectorPair r0_c1_qf_p = Q6_Wqf32_vmpy_VhfVhf(r0_hf, c1_hf); - HVX_VectorPair r1_c0_qf_p = Q6_Wqf32_vmpy_VhfVhf(r1_hf, c0_hf); - HVX_VectorPair r1_c1_qf_p = Q6_Wqf32_vmpy_VhfVhf(r1_hf, c1_hf); - - HVX_Vector r0_c0_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r0_c0_qf_p), Q6_V_hi_W(r0_c0_qf_p)); - HVX_Vector r0_c1_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r0_c1_qf_p), Q6_V_hi_W(r0_c1_qf_p)); - HVX_Vector r1_c0_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r1_c0_qf_p), Q6_V_hi_W(r1_c0_qf_p)); - HVX_Vector r1_c1_qf = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(r1_c1_qf_p), Q6_V_hi_W(r1_c1_qf_p)); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_qf, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_qf, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_qf, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_qf, r1_c1_sum)); - + r0_c0_sum_p = hvx_vec_mpyacc_f32_f16(r0_c0_sum_p, r0_hf, c0_hf); + r0_c1_sum_p = 
hvx_vec_mpyacc_f32_f16(r0_c1_sum_p, r0_hf, c1_hf); + r1_c0_sum_p = hvx_vec_mpyacc_f32_f16(r1_c0_sum_p, r1_hf, c0_hf); + r1_c1_sum_p = hvx_vec_mpyacc_f32_f16(r1_c1_sum_p, r1_hf, c1_hf); } + HVX_Vector r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(r0_c0_sum_p), Q6_V_hi_W(r0_c0_sum_p))); + HVX_Vector r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(r0_c1_sum_p), Q6_V_hi_W(r0_c1_sum_p))); + HVX_Vector r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(r1_c0_sum_p), Q6_V_hi_W(r1_c0_sum_p))); + HVX_Vector r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(r1_c1_sum_p), Q6_V_hi_W(r1_c1_sum_p))); + // Reduce and store results HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum); HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum); diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index aa6a6c9008d..9aeb80d0b8b 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -18,7 +18,7 @@ #include "htp-msg.h" #include "htp-ops.h" -// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we cant include ggml.h +// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h #define HTP_ROPE_TYPE_NORMAL 0 #define HTP_ROPE_TYPE_NEOX 2 diff --git a/ggml/src/ggml-hexagon/htp/worker-pool.c b/ggml/src/ggml-hexagon/htp/worker-pool.c index 894815f46a5..172e28908eb 100644 --- a/ggml/src/ggml-hexagon/htp/worker-pool.c +++ b/ggml/src/ggml-hexagon/htp/worker-pool.c @@ -56,7 +56,7 @@ static void worker_pool_main(void * context) { unsigned int n = atomic_load(&pool->n_jobs); unsigned int i = atomic_fetch_add(&pool->next_job, 1); if (i >= n) { - // Spurios wakeup + // Spurious wakeup continue; } diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 3db7f126291..4cce414abfe 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1281,7 +1281,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te bool use_residency_sets; // optional MTLResidencySet - // note: cannot use explicity "id" here because it is not available on certain OSes + // note: cannot use explicitly "id" here because it is not available on certain OSes id rset; // pointers to global device diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 3d5db0b79f5..b3390352ffc 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -631,7 +631,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) { const bool inplace = (bool) ((const int32_t *) op->op_params)[4]; if (!inplace) { - // run a separete kernel to cpy src->dst + // run a separate kernel to cpy src->dst // not sure how to avoid this // TODO: make a simpler cpy_bytes kernel @@ -1644,7 +1644,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) { const bool inplace = (bool) ((const int32_t *) op->op_params)[4]; if (!inplace) { - // run a separete kernel to cpy src->dst + // run a separate kernel to cpy src->dst // not sure how to avoid this // TODO: make a simpler cpy_bytes kernel @@ -2005,7 +2005,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) { const int16_t r0ptg = nypsg*nsg; // num src0 rows per threadgroup int16_t r1ptg = 4; // num src1 rows per threadgroup - // note: not sure how optimal are those across all different hardware. 
there might be someting cleverer + // note: not sure how optimal are those across all different hardware. there might be something cleverer switch (ne11) { case 2: r1ptg = 2; break; diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp index 1c705362fb7..9382ce53b36 100644 --- a/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp @@ -14,7 +14,7 @@ #define GGML_METAL_MAX_DEVICES 16 // number of Metal devices -// note: can be overriden with GGML_METAL_DEVICES env to simulate virtual devices +// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices static int g_devices = 1; //////////////////////////////////////////////////////////////////////////////// diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 6c349aa0c92..a58e641ad86 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4218,7 +4218,7 @@ kernel void kernel_im2col( template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col; template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col; -// TODO: obolete -- remove +// TODO: obsolete -- remove //typedef void (im2col_ext_t)( // constant ggml_metal_kargs_im2col & args, // device const float * x, diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 0b9a021d204..7af032ce0e1 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -313,7 +313,7 @@ struct ProfilingInfo { cl_ulong cmd_duration_ns; // The time for the kernel to complete - COMPLETE - END cl_ulong cmd_complete_duration_ns; - // Total time to finish the kernel - COMPELTE - QUEUED + // Total time to finish the kernel - COMPLETE - QUEUED cl_ulong cmd_total_duration_ns; // Global and local work sizes. 
size_t global_size[3]; @@ -416,7 +416,6 @@ struct ggml_backend_opencl_context { cl_program program_add; cl_program program_add_id; cl_program program_clamp; - cl_program program_cpy; cl_program program_cvt; cl_program program_diag_mask_inf; cl_program program_gelu; @@ -514,7 +513,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32; cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16; cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16; - cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32; + cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32; cl_kernel kernel_mul_mat_f32_f32; cl_kernel kernel_mul_mat_f16_f16; cl_kernel kernel_mul_mat_f16_f32_1row; @@ -873,13 +872,14 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve #else const std::string kernel_src = read_file("cpy.cl"); #endif - backend_ctx->program_cpy = + cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); - CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f16", &err), err)); - CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f32", &err), err)); - CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f16", &err), err)); - CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(prog, "kernel_cpy_f16_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err)); GGML_LOG_CONT("."); } @@ -2555,7 +2555,7 @@ static std::vector ggml_opencl_probe_devices(ggml_backend_r cl_platform_id platform_ids[NPLAT]; if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) { - GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n"); + GGML_LOG_ERROR("ggml_opencl: platform IDs not available.\n"); return found_devices; } @@ -3339,7 +3339,7 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) { CL_CHECK(clReleaseEvent(evt)); } -// Syncronizes the 'backend_ctx's device with others so that commands +// Synchronizes the 'backend_ctx's device with others so that commands // enqueued to it won't start until commands in the other devices have // completed. 
static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) { @@ -3544,9 +3544,21 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te default: return false; } + case GGML_TYPE_I32: + switch (op->type) { + case GGML_TYPE_I32: + return true; + default: + return false; + } default: return false; } + case GGML_OP_SET: { + return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32) && + op->type == op->src[0]->type && + op->type == op->src[1]->type; + } case GGML_OP_SCALE: return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]); case GGML_OP_ADD: @@ -3985,7 +3997,7 @@ struct ggml_backend_opencl_buffer_context { // The buffer_context is initially created by ggml_backend_buft_alloc_buffer // before any tensor is initialized (at the beginning of alloc_tensor_range). - // Hence, there is alway a buffer object in this vector. When each tensor is + // Hence, there is always a buffer object in this vector. When each tensor is // being initialized, this original buffer object will be released if both // flattening and small allocation are enabled, and additional buffer // objects will be created in init_tensor to represent flattened quantized @@ -4120,7 +4132,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, //GGML_ASSERT(offset == 0); // We create subbuffers from the original tensor buffer for scales and - // quants - i.e., scales and quants are aliases into the buffer obejct + // quants - i.e., scales and quants are aliases into the buffer object // that backs the original tensor. This is a cleaner way to adapt to the // new memory management. // In the old code, we allocate new buffers for scales and quants @@ -10782,28 +10794,13 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const // GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst. UNUSED(dst); - const int ne00 = src0 ? src0->ne[0] : 0; - const int ne01 = src0 ? src0->ne[1] : 0; - const int ne02 = src0 ? src0->ne[2] : 0; - const int ne03 = src0 ? src0->ne[3] : 0; - - const cl_ulong nb00 = src0 ? src0->nb[0] : 0; - const cl_ulong nb01 = src0 ? src0->nb[1] : 0; - const cl_ulong nb02 = src0 ? src0->nb[2] : 0; - const cl_ulong nb03 = src0 ? src0->nb[3] : 0; - - const int ne10 = src1 ? src1->ne[0] : 0; - const int ne11 = src1 ? src1->ne[1] : 0; - const int ne12 = src1 ? src1->ne[2] : 0; - const int ne13 = src1 ? src1->ne[3] : 0; - - const cl_ulong nb10 = src1 ? src1->nb[0] : 0; - const cl_ulong nb11 = src1 ? src1->nb[1] : 0; - const cl_ulong nb12 = src1 ? src1->nb[2] : 0; - const cl_ulong nb13 = src1 ? src1->nb[3] : 0; + GGML_TENSOR_LOCALS(int, ne0, src0, ne); + GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb); + GGML_TENSOR_LOCALS(int, ne1, src1, ne); + GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb); - const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; - const enum ggml_type src1t = src1 ? 
src1->type : GGML_TYPE_COUNT; + const enum ggml_type src0t = src0->type; + const enum ggml_type src1t = src1->type; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; @@ -10840,6 +10837,15 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const GGML_ASSERT(false && "not implemented"); } break; + case GGML_TYPE_I32: + switch (src1t) { + case GGML_TYPE_I32: + kernel = backend_ctx->kernel_cpy_i32_i32; + break; + default: + GGML_ASSERT(false && "not implemented"); + } + break; default: GGML_ASSERT(false && "not implemented"); } @@ -10878,6 +10884,89 @@ static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); } +static void ggml_cl_set(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + GGML_ASSERT((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32) && + src1->type == src0->type && dst->type == src0->type); + + GGML_TENSOR_LOCALS(int, ne0, src0, ne); + GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb); + GGML_TENSOR_LOCALS(int, ne1, src1, ne); + GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb); + GGML_TENSOR_LOCALS(int, ne, dst, ne); + GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + const cl_ulong pnb1 = ((const int32_t *)dst->op_params)[0]; + const cl_ulong pnb2 = ((const int32_t *)dst->op_params)[1]; + const cl_ulong pnb3 = ((const int32_t *)dst->op_params)[2]; + const cl_ulong offs = ((const int32_t *)dst->op_params)[3]; + const bool inplace = (bool)((const int32_t *)dst->op_params)[4]; + + cl_kernel kernel = nullptr; + + // for inplace case, dst is a view of src0 and is updated on top of it + // so for non-inplace case, copy src0 to dst first + if (!inplace) { + ggml_cl_cpy(backend, src0, dst, nullptr); + } + + // then copy src1 to dst with specified offset + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_cpy_f32_f32; + } else if (src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { + kernel = backend_ctx->kernel_cpy_i32_i32; + } else { + GGML_ASSERT(false && "not implemented"); + } + + offsetd += offs; + cl_ulong nb = ggml_element_size(dst); + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne13)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb10)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb13)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); + 
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &pnb1)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &pnb2)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &pnb3)); + + int max_local_size = backend_ctx->get_kernel_workgroup_size(kernel); + + const int nth = MIN(max_local_size, ne00); + + size_t global_work_size[] = {(size_t)ne11*nth, (size_t)ne12, (size_t)ne13}; + size_t local_work_size[] = {(size_t)nth, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} + static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -11651,6 +11740,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_cpy; break; + case GGML_OP_SET: + if (!any_on_device) { + return false; + } + func = ggml_cl_set; + break; case GGML_OP_DUP: case GGML_OP_CONT: if (!any_on_device) { diff --git a/ggml/src/ggml-opencl/kernels/cpy.cl b/ggml/src/ggml-opencl/kernels/cpy.cl index 9369351a60c..820aa538a34 100644 --- a/ggml/src/ggml-opencl/kernels/cpy.cl +++ b/ggml/src/ggml-opencl/kernels/cpy.cl @@ -182,3 +182,48 @@ kernel void kernel_cpy_f32_f32( dst_data[i00] = src[0]; } } + +kernel void kernel_cpy_i32_i32( + global int * src0, + ulong offset0, + global int * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne03, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne0, + int ne1, + int ne2, + int ne3, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = (global int*)((global char*)src0 + offset0); + dst = (global int*)((global char*)dst + offsetd); + + int i03 = get_group_id(2); + int i02 = get_group_id(1); + int i01 = get_group_id(0); + + int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + int i3 = n / (ne2*ne1*ne0); + int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + global int * dst_data = (global int *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) { + global const int * src = (global int *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + dst_data[i00] = src[0]; + } +} diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 519638fd416..04c9e1d7864 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -76,10 +76,10 @@ extern int g_ggml_sycl_prioritize_dmmv; #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP -#define VER_4VEC 610 // todo for hardward optimize. -#define VER_GEN9 700 // todo for hardward optimize. -#define VER_GEN12 1000000 // todo for hardward optimize. -#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardward optimize. +#define VER_4VEC 610 // todo for hardware optimize. +#define VER_GEN9 700 // todo for hardware optimize. +#define VER_GEN12 1000000 // todo for hardware optimize. +#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardware optimize. 
#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares diff --git a/ggml/src/ggml-sycl/quants.hpp b/ggml/src/ggml-sycl/quants.hpp index d0d5ac9a4e8..14490fea5be 100644 --- a/ggml/src/ggml-sycl/quants.hpp +++ b/ggml/src/ggml-sycl/quants.hpp @@ -29,7 +29,7 @@ namespace ggml_sycl_reordered { // [qs0, qs1, qs2, ..., qsN] [d0, d1, d2, ..., dN] // // Notes: out-of-bounds qs will run into d values -// Aligment relies on the allocated size of qs +// Alignment relies on the allocated size of qs template struct block_q_t; diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index b41124acc13..15d92e5e04c 100644 --- a/ggml/src/ggml-sycl/softmax.cpp +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -37,7 +37,7 @@ struct soft_max_params { }; // When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled. -// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here. +// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here. #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wpass-failed" diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index de01336cd3f..715a263a6d0 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -90,7 +90,7 @@ if (Vulkan_FOUND) target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build - # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector + # Possibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0) endif() diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py index e0d478ce95d..e954644e28f 100644 --- a/gguf-py/gguf/metadata.py +++ b/gguf-py/gguf/metadata.py @@ -186,7 +186,7 @@ def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]: # Quick hack to fix the Norway problem # https://hitchdev.com/strictyaml/why/implicit-typing-removed/ yaml_content = yaml_content.replace("- no\n", "- \"no\"\n") - # yaml should use 2 spaces insted of tab + # yaml should use 2 spaces instead of tab # this issue has came up with the Qwen/Qwen3-235B-A22B-Instruct-2507 model card # (I've also sent a pr tp fix the modelcard too) yaml_content = yaml_content.replace("\t", " ") diff --git a/gguf-py/tests/test_metadata.py b/gguf-py/tests/test_metadata.py index 40d484f4eaa..b77c563ff25 100755 --- a/gguf-py/tests/test_metadata.py +++ b/gguf-py/tests/test_metadata.py @@ -164,7 +164,7 @@ def test_get_model_id_components(self): self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"), ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B')) - # Negative size --> output is a LoRA adaper --> prune "LoRA" out of the name to avoid redundancy with the suffix + # Negative size --> output is a LoRA adapter --> prune "LoRA" out of the name to avoid redundancy with the suffix self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234), ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B')) diff --git a/include/llama.h 
b/include/llama.h index 077f66dc651..a84d56a8850 100644 --- a/include/llama.h +++ b/include/llama.h @@ -973,7 +973,7 @@ extern "C" { // Logits for the ith token. For positive indices, Equivalent to: // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab - // Negative indicies can be used to access logits in reverse order, -1 is the last logit. + // Negative indices can be used to access logits in reverse order, -1 is the last logit. // returns NULL for invalid ids. LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); @@ -988,7 +988,7 @@ extern "C" { // Get the embeddings for the ith token. For positive indices, Equivalent to: // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd - // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding. + // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding. // shape: [n_embd] (1-dimensional) // returns NULL for invalid ids. LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); @@ -1008,9 +1008,9 @@ extern "C" { // Returns LLAMA_TOKEN_NULL if no token was sampled. LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i); - // Get the backend sampled probabilites for the ith token + // Get the backend sampled probabilities for the ith token // The index matches llama_get_sampled_token_ith(). - // Returns NULL if no probabilites were generated. + // Returns NULL if no probabilities were generated. LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i); LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i); @@ -1337,7 +1337,7 @@ extern "C" { float tau, float eta); - /// @details Intializes a GBNF grammar, see grammars/README.md for details. + /// @details Initializes a GBNF grammar, see grammars/README.md for details. /// @param vocab The vocabulary that this grammar will be used with. /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails. /// @param grammar_root The name of the start symbol for the grammar. diff --git a/scripts/pr2wt.sh b/scripts/pr2wt.sh index 067f5d466b6..e028814c431 100755 --- a/scripts/pr2wt.sh +++ b/scripts/pr2wt.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# intialize a new worktree from a PR number: +# initialize a new worktree from a PR number: # # - creates a new remote using the fork's clone URL # - creates a local branch tracking the remote branch diff --git a/scripts/server-bench.py b/scripts/server-bench.py index dbbb0939ffe..202c35a486b 100755 --- a/scripts/server-bench.py +++ b/scripts/server-bench.py @@ -292,6 +292,6 @@ def benchmark( "--n_predict_min", type=int, default=1024, help="Min. number of tokens to predict per prompt (supported for synthetic prompts only)") parser.add_argument("--seed_offset", type=int, default=0, help="Offset for determining the seeds for pseudorandom prompt/generation lengths. " - "Corelations between seeds can occur when set >= 1000. Negative values mean no seed.") + "Correlations between seeds can occur when set >= 1000. 
Negative values mean no seed.") args = parser.parse_args() benchmark(**vars(args)) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 98d055d34ef..eee9021296e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -158,7 +158,7 @@ llama_context::llama_context( cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; - // intialized later + // initialized later cparams.pipeline_parallel = false; { @@ -1981,7 +1981,7 @@ ggml_cgraph * llama_context::graph_reserve( ggml_backend_sched_reset(sched.get()); - // when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that + // when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that gf_res_prev->reset(); // store the n_outputs as it is, and restore it afterwards diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 23a86ea2905..b8126ce5081 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1616,7 +1616,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const { ggml_tensor * llm_graph_context::build_inp_out_ids() const { // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls, // but this would make the graph topology depend on the number of output tokens, which can interere with - // features that require constant topology such as pipline parallelism + // features that require constant topology such as pipeline parallelism // ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471 //if (n_outputs < n_tokens) { // return nullptr; @@ -1779,7 +1779,7 @@ ggml_tensor * llm_graph_context::build_attn_mha( if (v_mla) { #if 0 // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens. - // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient. + // However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient. cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens); cur = ggml_mul_mat(ctx0, v_mla, cur); #else diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 6b668ee9abd..4031bafe9ec 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -583,7 +583,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector seq_srct; std::unordered_map> seq_idxs; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index dabf3b3086e..60b7cc6946f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1524,7 +1524,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { } switch (hparams.n_layer) { - // TODO: Jamba layers are a bit heterogenous, so naming this is hard. + // TODO: Jamba layers are a bit heterogeneous, so naming this is hard. 
case 12: // 900M 8x???M case 32: // 51B 16x?B default: type = LLM_TYPE_UNKNOWN; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 194eed238ec..ce83361dc79 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1833,7 +1833,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - // correct endiannes of data in precompiled_charsmap binary blob + // correct endianness of data in precompiled_charsmap binary blob uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0]; *xcda_blob_size = __builtin_bswap32(*xcda_blob_size); assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap); diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index b608396e50e..be81709c50b 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -146,7 +146,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr cb(Qcur, "Qcur_attn_temp_scaled", il); } - // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) + // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group) cur = build_attn(inp_attn_k, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il); diff --git a/src/models/models.h b/src/models/models.h index 0712d03d8d9..cf9ba04e7f7 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -3,7 +3,7 @@ #include "llama-model.h" #include "llama-graph.h" -// note: almost all graphs require atleast sqrtf, so include cmath globally +// note: almost all graphs require at least sqrtf, so include cmath globally #include // diff --git a/src/unicode.cpp b/src/unicode.cpp index 1475b53b659..122c8ca04a5 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -773,7 +773,7 @@ static std::vector unicode_regex_split_custom(const std::string & text, // tiny_aya digit grouping pattern from tokenizer.json: // {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"} // Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567) - // TODO: Revisit this regex, incase there are any subtle tokenization differences with the original regex. + // TODO: Revisit this regex, in case there are any subtle tokenization differences with the original regex. 
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets); } diff --git a/tests/test-alloc.cpp b/tests/test-alloc.cpp index 95e09c97b02..7ae739ad2ef 100644 --- a/tests/test-alloc.cpp +++ b/tests/test-alloc.cpp @@ -285,7 +285,7 @@ static void test_max_size_too_many_tensors() { GGML_ASSERT(backend.context->allocated_total() <= 16 + 16); } -// Scenario where there is some space left in the first buffer, but not enough to accomodate +// Scenario where there is some space left in the first buffer, but not enough to accommodate // a larger tensor, so a second buffer is required static void test_max_size_tensor_too_large() { dummy_backend backend = dummy_backend_init(32); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 0ac21cdcf62..7c6938d447b 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1868,9 +1868,9 @@ struct test_case { }; -// ################################### -// ## Section 2: GGML Op Defintions ## -// ################################### +// #################################### +// ## Section 2: GGML Op Definitions ## +// #################################### // The following is an example showing the bare minimum for creating a test for a GGML op. @@ -6222,7 +6222,7 @@ struct test_flash_attn_ext : public test_case { void initialize_tensors(ggml_context * ctx) override { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { if (strcmp(t->name, "s") == 0) { - // make the sink values more noticable in order to trigger a test failure when the implementation is wrong + // make the sink values more noticeable in order to trigger a test failure when the implementation is wrong init_tensor_uniform(t, -10.0f, 10.0f); } else if (strcmp(t->name, "m") == 0) { init_tensor_kq_mask(t); diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index f3d19118b58..46aec8395fa 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -438,7 +438,7 @@ static void test_templates(const struct common_chat_templates * tmpls, const std } /** - * Test if streaming=true is consistant with streaming=false for given partial parser + * Test if streaming=true is consistent with streaming=false for given partial parser * Also test if there is any problem with partial message */ template diff --git a/tools/completion/README.md b/tools/completion/README.md index bcc08876592..f868c2c7d7d 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -480,7 +480,7 @@ Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. -By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. +By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. 
XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`. @@ -531,7 +531,7 @@ These options help improve the performance and memory usage of the LLaMA models. ### NUMA support -- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes. +- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes. - `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node. - `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitrary core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus. diff --git a/tools/cvector-generator/cvector-generator.cpp b/tools/cvector-generator/cvector-generator.cpp index 4c8ca61ec4f..dcce0e98418 100644 --- a/tools/cvector-generator/cvector-generator.cpp +++ b/tools/cvector-generator/cvector-generator.cpp @@ -110,7 +110,7 @@ struct callback_data { auto diff_filtered = filter_nonzero_rows(v_pos[il]); v_diff_filtered.push_back(diff_filtered); } - return v_diff_filtered; // for convinient, we return the result std::vector + return v_diff_filtered; // for convenience, we return the result std::vector } // delete zero rows from a given 2D tensor diff --git a/tools/imatrix/README.md b/tools/imatrix/README.md index 4505cb4ce8c..4cbe4fd0cf7 100644 --- a/tools/imatrix/README.md +++ b/tools/imatrix/README.md @@ -95,4 +95,4 @@ Weighted averages of Σ(Act²), ZD Score and CosSim are also calculated. #### Important note on the computed Statistics When using these statistics, please note that they are computed on the squared activations, **not on the actual (raw) activations**. -Whilst the results are still useful, they're less realiable than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors. +Whilst the results are still useful, they're less reliable than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors.
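The imatrix note above about squared versus raw activations can be made concrete with a small, self-contained sketch (illustrative only, not code from the imatrix tool; the toy activation vectors are assumed):

```python
import numpy as np

def cos_sim(a, b):
    # plain cosine similarity between two vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# toy activations pointing in exactly opposite directions
act = np.array([0.5, -1.0, 2.0])
opp = -act

print(cos_sim(act, opp))        # -1.0: raw activations expose the sign flip
print(cos_sim(act**2, opp**2))  #  1.0: squared activations look identical
```

This is why a CosSim statistic derived from Σ(Act²) is a weaker signal than one computed from the raw activations.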
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index a30c32ed42b..0c3cf8670a4 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -68,7 +68,7 @@ #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" -#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat +#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backward compat #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" #define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_NORM_EMBD "v.norm_embd.%s" diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index e0eb9b32c8f..eeb8da58e08 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -46,7 +46,7 @@ struct clip_hparams { float image_std[3]; // for models using dynamic image size, we need to have a smaller image size to warmup - // otherwise, user will get OOM everytime they load the model + // otherwise, user will get OOM every time they load the model int32_t warmup_image_size = 0; int32_t warmup_audio_size = 3000; @@ -221,7 +221,7 @@ struct clip_model { // embeddings ggml_tensor * class_embedding = nullptr; ggml_tensor * patch_embeddings_0 = nullptr; - ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL) ggml_tensor * patch_bias = nullptr; ggml_tensor * position_embeddings = nullptr; ggml_tensor * norm_embd_w = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 607d4b83731..b70bad33b68 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2287,7 +2287,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 } } -// set of tools to manupulate images +// set of tools to manipulate images // in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv struct img_tool { enum resize_algo { diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py index 944037e703e..1f563fbfc59 100644 --- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -186,7 +186,7 @@ def trunc_normal_tf_( best when :math:`a \\leq \text{mean} \\leq b`. NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 - and the result is subsquently scaled and shifted by the mean and std args. + and the result is subsequently scaled and shifted by the mean and std args. 
Args: tensor: an n-dimensional `torch.Tensor` mean: the mean of the normal distribution diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index e8eef035ff5..447f61aaa40 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -560,7 +560,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) { int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off); if ((size_t) n_len < frames_per_chunk) { - break; // last uncomplete chunk will always be a padded chunk, safe to ignore + break; // last incomplete chunk will always be a padded chunk, safe to ignore } mtmd_audio_mel out_chunk; diff --git a/tools/perplexity/README.md b/tools/perplexity/README.md index eb3846072ea..f82d34c8a25 100644 --- a/tools/perplexity/README.md +++ b/tools/perplexity/README.md @@ -27,10 +27,10 @@ In addition to the KL divergence the following statistics are calculated with `- * Ratio of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. The logarithm of this metric is also calculated and printed, it is 0 if the logit distributions are the same. * Difference of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. * Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. -* Pearson correlation coefficient of the "correct" token probabilites between models. +* Pearson correlation coefficient of the "correct" token probabilities between models. * Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization. * The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggml-org/llama.cpp/discussions/2875 . -* Same top p: Percentage of how often the token was assigned the highest probabilites by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution. +* Same top p: Percentage of how often the token was assigned the highest probabilities by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution. ## LLaMA 3 8b Scoreboard diff --git a/tools/quantize/README.md b/tools/quantize/README.md index 22f07102867..b8c225124b3 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -100,7 +100,7 @@ Examples: ## Memory/Disk Requirements When running the larger models, make sure you have enough disk space to store all the intermediate files. -As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. 
For exmaple (Llama 3.1): +As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For example (Llama 3.1): | Model | Original size | Quantized size (Q4_K_M) | | ----: | ------------: | ----------------------: | diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index a5465fcd132..77362ce66de 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/public_legacy/index-new.html b/tools/server/public_legacy/index-new.html index e2f39d6687e..2cee7f3c3c4 100644 --- a/tools/server/public_legacy/index-new.html +++ b/tools/server/public_legacy/index-new.html @@ -36,7 +36,7 @@ const params = signal({ n_predict: 358, // 358 is a nice number - temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower + temperature: 0.8, // adapt all following parameters to optimized min-p requirements. If for non-english, set to 0.6 or lower repeat_last_n: 0, // 0 = disable penalty, -1 = context size repeat_penalty: 1.0, // 1.0 = disabled dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well @@ -108,7 +108,7 @@ let importedTemplates = local_storage_getDataAsObject('user_templates') if (importedTemplates) { - // saved templates were successfuly imported. + // saved templates were successfully imported. console.log('Processing saved templates and updating default template') params.value = { ...params.value, image_data: [] }; @@ -129,7 +129,7 @@ } function userTemplateResetToDefault() { - console.log('Reseting themplate to default') + console.log('Reseting template to default') selectedUserTemplate.value.name = 'default'; selectedUserTemplate.value.data = savedUserTemplates.value['default']; } diff --git a/tools/server/public_simplechat/datautils.mjs b/tools/server/public_simplechat/datautils.mjs index 75159d6b167..08ccc219bfd 100644 --- a/tools/server/public_simplechat/datautils.mjs +++ b/tools/server/public_simplechat/datautils.mjs @@ -63,7 +63,7 @@ export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold /** - * Simple minded logic to help remove repeating garbage at end of the string, till it cant. + * Simple minded logic to help remove repeating garbage at end of the string, till it can't. * If its not able to trim, then it will try to skip a char at end and then trim, a few times. * This ensures that even if there are multiple runs of garbage with different patterns, the * logic still tries to munch through them. diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md index 24e026d455b..cc86d62494c 100644 --- a/tools/server/public_simplechat/readme.md +++ b/tools/server/public_simplechat/readme.md @@ -30,7 +30,7 @@ The UI follows a responsive web design so that the layout can adapt to available enough manner, in general. Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool -console. Parallely some of the directly useful to end-user settings can also be changed using the provided +console. Parallelly some of the directly useful to end-user settings can also be changed using the provided settings ui. 
NOTE: Current web service api doesnt expose the model context length directly, so client logic doesnt provide @@ -38,7 +38,7 @@ any adaptive culling of old messages nor of replacing them with summary of their is a optional sliding window based chat logic, which provides a simple minded culling of old messages from the chat history before sending to the ai model. -NOTE: Wrt options sent with the request, it mainly sets temperature, max_tokens and optionaly stream for now. +NOTE: Wrt options sent with the request, it mainly sets temperature, max_tokens and optionally stream for now. However if someone wants they can update the js file or equivalent member in gMe as needed. NOTE: One may be able to use this to chat with openai api web-service /chat/completions endpoint, in a very @@ -88,7 +88,7 @@ Once inside then the end user needs to enter the same. This keeps the logic simple, while still giving flexibility to the end user to manage any templating/tagging requirement wrt their messages to the model. - * the logic doesnt insert newline at the begining and end wrt the prompt message generated. + * the logic doesnt insert newline at the beginning and end wrt the prompt message generated. However if the chat being sent to /completions end point has more than one role's message, then insert newline when moving from one role's message to the next role's message, so that it can be clearly identified/distinguished. @@ -101,8 +101,8 @@ Once inside Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting responses with a suitable system prompt. * if chat.add_system_begin is used - * you cant change the system prompt, after it is has been submitted once along with user query. - * you cant set a system prompt, after you have submitted any user query + * you can't change the system prompt, after it has been submitted once along with user query. + * you can't set a system prompt, after you have submitted any user query * if chat.add_system_anytime is used * one can change the system prompt any time during chat, by changing the contents of system prompt. * inturn the updated/changed system prompt will be inserted into the chat session. @@ -129,7 +129,7 @@ Once inside ### Reason behind this -The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable +The idea is to be easy enough to use for basic purposes, while also being simple and easily discernible by developers who may not be from web frontend background (so inturn may not be familiar with template / end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things. @@ -167,7 +167,7 @@ It is attached to the document object. Some of these can also be updated using t messages that get inserted into prompt field wrt /Completion endpoint. bTrimGarbage - whether garbage repeatation at the end of the generated ai response, should be - trimmed or left as is. If enabled, it will be trimmed so that it wont be sent back as part of + trimmed or left as is. If enabled, it will be trimmed so that it won't be sent back as part of subsequent chat history. At the same time the actual trimmed text is shown to the user, once when it was generated, so user can check if any useful info/data was there in the response. @@ -244,7 +244,7 @@ full chat history. This way if there is any response with garbage/repeatation, i mess with things beyond the next question/request/query, in some ways.
The trim garbage option also tries to help avoid issues with garbage in the context to an extent. -Set max_tokens to 1024, so that a relatively large previous reponse doesnt eat up the space +Set max_tokens to 1024, so that a relatively large previous response doesnt eat up the space available wrt next query-response. However dont forget that the server when started should also be started with a model context size of 1k or more, to be on safe side. diff --git a/tools/server/public_simplechat/simplechat.js b/tools/server/public_simplechat/simplechat.js index 2fcd24a860b..c67577d5ae7 100644 --- a/tools/server/public_simplechat/simplechat.js +++ b/tools/server/public_simplechat/simplechat.js @@ -318,7 +318,7 @@ class SimpleChat { } /** - * Allow setting of system prompt, but only at begining. + * Allow setting of system prompt, but only at beginning. * @param {string} sysPrompt * @param {string} msgTag */ @@ -333,7 +333,7 @@ class SimpleChat { console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`); } else { if (this.xchat[0].content !== sysPrompt) { - console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`); + console.error(`ERRR:SimpleChat:SC:${msgTag}:You can't change system prompt, mid way through, ignoring...`); } } } diff --git a/tools/server/public_simplechat/ui.mjs b/tools/server/public_simplechat/ui.mjs index b2d5b9aeab7..afa619a0663 100644 --- a/tools/server/public_simplechat/ui.mjs +++ b/tools/server/public_simplechat/ui.mjs @@ -44,7 +44,7 @@ export function el_create_button(id, callback, name=undefined, innerText=undefin } /** - * Create a para and set it up. Optionaly append it to a passed parent. + * Create a para and set it up. Optionally append it to a passed parent. * @param {string} text * @param {HTMLElement | undefined} elParent * @param {string | undefined} id @@ -111,7 +111,7 @@ export function el_creatediv_boolbutton(id, label, texts, defaultValue, cb, clas /** * Create a select ui element, with a set of options to select from. * * options: an object which contains name-value pairs - * * defaultOption: the value whose name should be choosen, by default. + * * defaultOption: the value whose name should be chosen, by default. * * cb : the call back returns the name string of the option selected. 
* * @param {string} id diff --git a/tools/server/tests/README.md b/tools/server/tests/README.md index a60d3f8ea1a..f566b43644b 100644 --- a/tools/server/tests/README.md +++ b/tools/server/tests/README.md @@ -57,7 +57,7 @@ To run a single test: ./tests.sh unit/test_chat_completion.py::test_invalid_chat_completion_req ``` -Hint: You can compile and run test in single command, useful for local developement: +Hint: You can compile and run test in single command, useful for local development: ```shell cmake --build build -j --target llama-server && ./tools/server/tests/tests.sh diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte b/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte index 4494ea880b9..c676e224a72 100644 --- a/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte +++ b/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte @@ -8,6 +8,7 @@ tooltip: string; variant?: 'default' | 'destructive' | 'outline' | 'secondary' | 'ghost' | 'link'; size?: 'default' | 'sm' | 'lg' | 'icon'; + iconSize?: string; class?: string; disabled?: boolean; onclick: () => void; @@ -21,6 +22,7 @@ size = 'sm', class: className = '', disabled = false, + iconSize = 'h-3 w-3', onclick, 'aria-label': ariaLabel }: Props = $props(); @@ -38,7 +40,7 @@ > {@const IconComponent = icon} - + diff --git a/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte b/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte index a0d5e863c2a..15936691a6a 100644 --- a/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte +++ b/tools/server/webui/src/lib/components/app/badges/BadgeModality.svelte @@ -1,6 +1,6 @@ {#snippet badgeContent()} @@ -31,7 +33,9 @@ {/snippet} - {model} + {#if model} + + {/if} {#if showCopyIcon} @@ -39,7 +43,7 @@ {/snippet} -{#if model && isModelMode} +{#if shouldShow} {#if showTooltip} diff --git a/tools/server/webui/src/lib/components/app/models/ModelId.svelte b/tools/server/webui/src/lib/components/app/models/ModelId.svelte new file mode 100644 index 00000000000..817e8828615 --- /dev/null +++ b/tools/server/webui/src/lib/components/app/models/ModelId.svelte @@ -0,0 +1,64 @@ + + +{#if resolvedShowRaw} + {modelId} +{:else} + + + {#if showOrgName}{parsed.orgName}/{/if}{parsed.modelName ?? modelId} + + + {#if parsed.params} + + {parsed.params}{parsed.activatedParams ? `-${parsed.activatedParams}` : ''} + + {/if} + + {#if parsed.quantization} + + {parsed.quantization} + + {/if} + + {#if aliases && aliases.length > 0} + {#each aliases as alias (alias)} + {alias} + {/each} + {/if} + + {#if tags && tags.length > 0} + {#each tags as tag (tag)} + {tag} + {/each} + {/if} + +{/if} diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte index ebffae12120..a40501e2cce 100644 --- a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte +++ b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte @@ -1,6 +1,7 @@ @@ -260,10 +315,25 @@ {#if loading && options.length === 0 && isRouter}
+ Loading models…
{:else if options.length === 0 && isRouter} -

No models available.

+ {#if currentModel} + + + + + + {:else} +

No models available.

+ {/if} {:else} {@const selectedOption = getDisplayOption()} @@ -280,7 +350,7 @@ type="button" class={cn( `inline-grid cursor-pointer grid-cols-[1fr_auto_1fr] items-center gap-1.5 rounded-sm bg-muted-foreground/10 px-1.5 py-1 text-xs transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60`, - !isCurrentModelInCache() + !isCurrentModelInCache ? 'bg-red-400/10 !text-red-400 hover:bg-red-400/20 hover:text-red-400' : forceForegroundText ? 'text-foreground' @@ -294,12 +364,21 @@ > - + {#if selectedOption} + + + + - {#if updating} + +

{selectedOption.model}

+
+
+ {:else} + Select model + {/if} + + {#if updating || isLoadingModel} {:else} @@ -316,10 +395,10 @@ placeholder="Search models..." onSearchKeyDown={handleSearchKeyDown} emptyMessage="No models found." - isEmpty={filteredOptions.length === 0 && isCurrentModelInCache()} + isEmpty={filteredOptions.length === 0 && isCurrentModelInCache} >
- {#if !isCurrentModelInCache() && currentModel} + {#if !isCurrentModelInCache && currentModel} -
{/if} + {#if filteredOptions.length === 0}

No models found.

{/if} - {#each filteredOptions as option, index (option.id)} - {@const status = getModelStatus(option.model)} - {@const isLoaded = status === ServerModelStatus.LOADED} - {@const isLoading = status === ServerModelStatus.LOADING} - {@const isSelected = currentModel === option.model || activeId === option.id} - {@const isHighlighted = index === highlightedIndex} - -
handleSelect(option.id)} - onmouseenter={() => (highlightedIndex = index)} - onkeydown={(e) => { - if (e.key === 'Enter' || e.key === ' ') { - e.preventDefault(); - handleSelect(option.id); - } - }} - > - + Loaded models +

+ {:else if group.isFavouritesGroup} +

+ Favourite models +

+ {:else if group.orgName} +

- {option.model} - - -

- {#if isLoading} - - - - - -

Loading model...

-
-
- {:else if isLoaded} - - - - - -

Unload model

-
-
- {:else} - - {/if} -
-
+ {group.orgName} +

+ {/if} + + {#each group.items as { option, flatIndex } (group.isLoadedGroup ? `loaded-${option.id}` : group.isFavouritesGroup ? `fav-${option.id}` : option.id)} + {@const isSelected = currentModel === option.model || activeId === option.id} + {@const isHighlighted = flatIndex === highlightedIndex} + {@const isFav = modelsStore.favouriteModelIds.has(option.model)} + + (highlightedIndex = flatIndex)} + onKeyDown={(e) => { + if (e.key === KeyboardKey.ENTER || e.key === KeyboardKey.SPACE) { + e.preventDefault(); + handleSelect(option.id); + } + }} + /> + {/each} {/each}
@@ -422,7 +465,7 @@