LongLeCE · pull · Jan 10, 2026 · Jan 10, 2026 · Jan 10, 2026 · Jan 10, 2026
diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt
@@ -32,14 +32,12 @@ if (BLAS_FOUND)
                 pkg_check_modules(DepBLAS openblas)
             endif()
         elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
-            add_compile_definitions(GGML_BLAS_USE_BLIS)
             pkg_check_modules(DepBLAS blis)
         elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
             pkg_check_modules(DepBLAS blas-atlas)
         elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
             pkg_check_modules(DepBLAS flexiblas_api)
         elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
-            add_compile_definitions(GGML_BLAS_USE_MKL)
             # all Intel* libraries share the same include path
             pkg_check_modules(DepBLAS mkl-sdl)
         elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
@@ -74,10 +72,26 @@ if (BLAS_FOUND)
 
     target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
 
-    if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+    if ("${GGML_BLAS_VENDOR}" STREQUAL "")
+        message(WARNING "GGML_BLAS_VENDOR is not set; some methods may not link properly.")
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "Intel" OR ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND "${GGML_BLAS_VENDOR}" MATCHES "Generic"))
         add_compile_definitions(GGML_BLAS_USE_MKL)
     endif()
 
+    if ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
+        add_compile_definitions(GGML_BLAS_USE_OPENBLAS)
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "FLAME" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL_mt")
+        add_compile_definitions(GGML_BLAS_USE_BLIS)
+    endif()
+
+    if ("${GGML_BLAS_VENDOR}" MATCHES "NVPL")
+        add_compile_definitions(GGML_BLAS_USE_NVPL)
+    endif()
+
     target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
     target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
 else()

diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -115,15 +115,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 #endif
     }
 
-#if defined(OPENBLAS_VERSION)
+#if defined(GGML_BLAS_USE_OPENBLAS)
     openblas_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(GGML_BLAS_USE_BLIS)
+#elif defined(GGML_BLAS_USE_BLIS)
     bli_thread_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(GGML_BLAS_USE_NVPL)
+#elif defined(GGML_BLAS_USE_NVPL)
     nvpl_blas_set_num_threads(ctx->n_threads);
 #endif
 
@@ -288,7 +284,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
         /* .context = */ ctx,
     };
 
-#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP)
     if (openblas_get_parallel() != OPENBLAS_OPENMP) {
         GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
     }
@@ -329,7 +325,7 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t
         return "BLIS";
     #elif defined(GGML_BLAS_USE_NVPL)
         return "NVPL";
-    #elif defined(OPENBLAS_VERSION)
+    #elif defined(GGML_BLAS_USE_OPENBLAS)
         return "OpenBLAS";
     #else
         return "BLAS";

diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
@@ -333,28 +333,31 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
     if (amd_wmma_available(cc)) {
-        // RDNA 4 is consistently worse on rocblas
-        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
         if (GGML_CUDA_CC_IS_RDNA3(cc)) {
-            // High expert counts almost always better on MMQ
-            // due to a large amount of graph splits
+            // High expert counts are almost always better on MMQ due to
+            //     the synchronization overhead in the cuBLAS/hipBLAS path:
             // https://github.com/ggml-org/llama.cpp/pull/18202
             if (n_experts >= 64) {
                 return true;
             }
 
+            // For some quantization types MMQ can have lower peak TOPS than hipBLAS
+            //     so it's only faster for sufficiently small batch sizes:
             switch (type) {
-                // These quants are really bad on MMQ
                 case GGML_TYPE_Q2_K:
+                    return ne11 <= 128;
                 case GGML_TYPE_Q6_K:
-                // These quants are usually worse but not always
+                    return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
                 case GGML_TYPE_IQ2_XS:
                 case GGML_TYPE_IQ2_S:
-                    return ne11 <= 128;
+                    return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
                 default:
                     return true;
             }
         }
+
+        // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
         return true;
     }
 

diff --git a/scripts/pr2wt.sh b/scripts/pr2wt.sh
@@ -40,7 +40,7 @@ org_repo=${org_repo%.git}
 
 echo "org/repo: $org_repo"
 
-meta=$(curl -sSf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR")
+meta=$(curl -sSLf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR")
 
 url_remote=$(echo "$meta" | jq -r '.head.repo.clone_url')
 head_ref=$(echo "$meta" | jq -r '.head.ref')

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -454,6 +454,28 @@ static bool ggml_is_view_op(enum ggml_op op) {
     return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
 }
 
+static bool backend_has_feature(ggml_backend_t backend, const char * feature_name) {
+    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+
+    auto get_features = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+    if (!get_features) {
+        return false;
+    }
+
+    const ggml_backend_feature * features = get_features(reg);
+    if (!features) {
+        return false;
+    }
+
+    for (const ggml_backend_feature * f = features; f->name; ++f) {
+        if (strcmp(f->name, feature_name) == 0 && strcmp(f->value, "1") == 0) {
+            return true;
+        }
+    }
+    return false;
+}
+
 enum test_mode {
     MODE_TEST,
     MODE_PERF,
@@ -1101,6 +1123,11 @@ struct test_case {
         return 1e-7;
     }
 
+    virtual double max_nmse_err(ggml_backend_t backend) {
+        GGML_UNUSED(backend);
+        return max_nmse_err();
+    }
+
     virtual double max_maa_err() {
         return 1e-4;
     }
@@ -1109,6 +1136,10 @@ struct test_case {
         return max_nmse_err();
     }
 
+    virtual double max_err(ggml_backend_t backend) {
+        return max_nmse_err(backend);
+    }
+
     virtual double err(const float * a, const float * b, size_t n) {
         return nmse(a, b, n);
     }
@@ -1378,8 +1409,8 @@ struct test_case {
             }
 
             double err = ud->tc->err(f1.data(), f2.data(), f1.size());
-            if (err > ud->tc->max_err()) {
-                printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err());
+            if (err > ud->tc->max_err(ud->backend1)) {
+                printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err(ud->backend1));
                 //for (int i = 0; i < (int) f1.size(); i++) {
                 //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
                 //}
@@ -3686,6 +3717,14 @@ struct test_mul_mat : public test_case {
         return 5e-4;
     }
 
+    double max_nmse_err(ggml_backend_t backend) override {
+        // for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance
+        if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
+            return 2e-2;
+        }
+        return max_nmse_err();
+    }
+
     int64_t grad_nmax() override {
         return 20000;
     }
@@ -3814,6 +3853,14 @@ struct test_mul_mat_id : public test_case {
         return 5e-4;
     }
 
+    double max_nmse_err(ggml_backend_t backend) override {
+        // for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance
+        if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
+            return 2e-2;
+        }
+        return max_nmse_err();
+    }
+
     uint64_t op_flops(ggml_tensor * t) override {
         GGML_UNUSED(t);
         return 2 * m * k * n * n_used;

diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py
@@ -393,12 +393,12 @@ def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
     for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
         if expect_ok:
             assert res.status_code == 200
+
+        # note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
+        if res.status_code == 200:
             assert "content" in res.body
             if "timings" in res.body:
                 assert res.body["timings"]["predicted_n"] == n_predict
-        else:
-            assert res.status_code == 500
-            assert "content" not in res.body
 
 
 @pytest.mark.parametrize(