From 7fdc8c893d31fb3aab50ad64bb712397fbf28deb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 10 Jan 2026 16:04:05 +0100 Subject: [PATCH 1/5] scripts : follow api redirects in pr2wt.sh (#18739) --- scripts/pr2wt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pr2wt.sh b/scripts/pr2wt.sh index 8e5d89462c5..bd635f3b9d8 100755 --- a/scripts/pr2wt.sh +++ b/scripts/pr2wt.sh @@ -40,7 +40,7 @@ org_repo=${org_repo%.git} echo "org/repo: $org_repo" -meta=$(curl -sSf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR") +meta=$(curl -sSLf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR") url_remote=$(echo "$meta" | jq -r '.head.repo.clone_url') head_ref=$(echo "$meta" | jq -r '.head.ref') From f307926482a465a6c7af5f212f365d50232d9cfb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 10 Jan 2026 17:51:56 +0200 Subject: [PATCH 2/5] server : adjust unified KV cache tests (#18716) --- tools/server/tests/unit/test_completion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py index ef1757db21f..2a980601ec6 100644 --- a/tools/server/tests/unit/test_completion.py +++ b/tools/server/tests/unit/test_completion.py @@ -393,12 +393,12 @@ def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success): for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success): if expect_ok: assert res.status_code == 200 + + # note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581 + if res.status_code == 200: assert "content" in res.body if "timings" in res.body: assert res.body["timings"]["predicted_n"] == n_predict - else: - assert res.status_code == 500 - assert "content" not in res.body @pytest.mark.parametrize( From 657a2e644bce23bc6fcf844544cb363e7870b2b3 Mon Sep 17 00:00:00 2001 
From: Perry Naseck <4472083+DaAwesomeP@users.noreply.github.com> Date: Sat, 10 Jan 2026 11:00:54 -0500 Subject: [PATCH 3/5] cmake : update blas logic (#18205) --- ggml/src/ggml-blas/CMakeLists.txt | 20 +++++++++++++++++--- ggml/src/ggml-blas/ggml-blas.cpp | 14 +++++--------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt index 60ce4b1e02c..fb0936f47b7 100644 --- a/ggml/src/ggml-blas/CMakeLists.txt +++ b/ggml/src/ggml-blas/CMakeLists.txt @@ -32,14 +32,12 @@ if (BLAS_FOUND) pkg_check_modules(DepBLAS openblas) endif() elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME") - add_compile_definitions(GGML_BLAS_USE_BLIS) pkg_check_modules(DepBLAS blis) elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS") pkg_check_modules(DepBLAS blas-atlas) elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS") pkg_check_modules(DepBLAS flexiblas_api) elseif (${GGML_BLAS_VENDOR} MATCHES "Intel") - add_compile_definitions(GGML_BLAS_USE_MKL) # all Intel* libraries share the same include path pkg_check_modules(DepBLAS mkl-sdl) elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC") @@ -74,10 +72,26 @@ if (BLAS_FOUND) target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS}) - if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel")) + if ("${GGML_BLAS_VENDOR}" STREQUAL "") + message(WARNING "GGML_BLAS_VENDOR is not set; some methods may not link properly.") + endif() + + if ("${GGML_BLAS_VENDOR}" MATCHES "Intel" OR ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND "${GGML_BLAS_VENDOR}" MATCHES "Generic")) add_compile_definitions(GGML_BLAS_USE_MKL) endif() + if ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS") + add_compile_definitions(GGML_BLAS_USE_OPENBLAS) + endif() + + if ("${GGML_BLAS_VENDOR}" MATCHES "FLAME" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL_mt") + add_compile_definitions(GGML_BLAS_USE_BLIS) + endif() + + if ("${GGML_BLAS_VENDOR}" 
MATCHES "NVPL") + add_compile_definitions(GGML_BLAS_USE_NVPL) + endif() + target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES}) target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS}) else() diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 5b888cdd8cd..84956cbb9ce 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -115,15 +115,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg #endif } -#if defined(OPENBLAS_VERSION) +#if defined(GGML_BLAS_USE_OPENBLAS) openblas_set_num_threads(ctx->n_threads); -#endif - -#if defined(GGML_BLAS_USE_BLIS) +#elif defined(GGML_BLAS_USE_BLIS) bli_thread_set_num_threads(ctx->n_threads); -#endif - -#if defined(GGML_BLAS_USE_NVPL) +#elif defined(GGML_BLAS_USE_NVPL) nvpl_blas_set_num_threads(ctx->n_threads); #endif @@ -288,7 +284,7 @@ ggml_backend_t ggml_backend_blas_init(void) { /* .context = */ ctx, }; -#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) +#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP) if (openblas_get_parallel() != OPENBLAS_OPENMP) { GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); } @@ -329,7 +325,7 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t return "BLIS"; #elif defined(GGML_BLAS_USE_NVPL) return "NVPL"; - #elif defined(OPENBLAS_VERSION) + #elif defined(GGML_BLAS_USE_OPENBLAS) return "OpenBLAS"; #else return "BLAS"; From d2ff4e23acd0724b44e0af72fd7e37fed4c1a6a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 10 Jan 2026 17:19:01 +0100 Subject: [PATCH 4/5] HIP: adjust RDNA3.5 MMQ kernel selection logic (#18666) --- ggml/src/ggml-cuda/mmq.cu | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index c9aa7024a9c..9a69f41d159 100644 --- 
a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -333,28 +333,31 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t } if (amd_wmma_available(cc)) { - // RDNA 4 is consistently worse on rocblas - // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301 if (GGML_CUDA_CC_IS_RDNA3(cc)) { - // High expert counts almost always better on MMQ - // due to a large amount of graph splits + // High expert counts are almost always better on MMQ due to + // the synchronization overhead in the cuBLAS/hipBLAS path: // https://github.com/ggml-org/llama.cpp/pull/18202 if (n_experts >= 64) { return true; } + // For some quantization types MMQ can have lower peak TOPS than hipBLAS + // so it's only faster for sufficiently small batch sizes: switch (type) { - // These quants are really bad on MMQ case GGML_TYPE_Q2_K: + return ne11 <= 128; case GGML_TYPE_Q6_K: - // These quants are usually worse but not always + return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 
128 : 256); case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_S: - return ne11 <= 128; + return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128; default: return true; } } + + // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS: + // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301 return true; } From b1377188784f9aea26b8abde56d4aee8c733eec7 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 11 Jan 2026 01:12:57 +0800 Subject: [PATCH 5/5] test-backend-ops: fix mxfp4 tests on blackwell (#18736) --- tests/test-backend-ops.cpp | 51 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 15567abedcf..56d277e1670 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -454,6 +454,28 @@ static bool ggml_is_view_op(enum ggml_op op) { return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE; } +static bool backend_has_feature(ggml_backend_t backend, const char * feature_name) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + + auto get_features = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); + if (!get_features) { + return false; + } + + const ggml_backend_feature * features = get_features(reg); + if (!features) { + return false; + } + + for (const ggml_backend_feature * f = features; f->name; ++f) { + if (strcmp(f->name, feature_name) == 0 && strcmp(f->value, "1") == 0) { + return true; + } + } + return false; +} + enum test_mode { MODE_TEST, MODE_PERF, @@ -1101,6 +1123,11 @@ struct test_case { return 1e-7; } + virtual double max_nmse_err(ggml_backend_t backend) { + GGML_UNUSED(backend); + return max_nmse_err(); + } + virtual double max_maa_err() { return 1e-4; } @@ -1109,6 +1136,10 @@ struct test_case { return max_nmse_err(); } + virtual double 
max_err(ggml_backend_t backend) { + return max_nmse_err(backend); + } + virtual double err(const float * a, const float * b, size_t n) { return nmse(a, b, n); } @@ -1378,8 +1409,8 @@ struct test_case { } double err = ud->tc->err(f1.data(), f2.data(), f1.size()); - if (err > ud->tc->max_err()) { - printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err()); + if (err > ud->tc->max_err(ud->backend1)) { + printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err(ud->backend1)); //for (int i = 0; i < (int) f1.size(); i++) { // printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]); //} @@ -3686,6 +3717,14 @@ struct test_mul_mat : public test_case { return 5e-4; } + double max_nmse_err(ggml_backend_t backend) override { + // for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance + if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) { + return 2e-2; + } + return max_nmse_err(); + } + int64_t grad_nmax() override { return 20000; } @@ -3814,6 +3853,14 @@ struct test_mul_mat_id : public test_case { return 5e-4; } + double max_nmse_err(ggml_backend_t backend) override { + // for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance + if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) { + return 2e-2; + } + return max_nmse_err(); + } + uint64_t op_flops(ggml_tensor * t) override { GGML_UNUSED(t); return 2 * m * k * n * n_used;