Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions ggml/src/ggml-blas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,12 @@ if (BLAS_FOUND)
pkg_check_modules(DepBLAS openblas)
endif()
elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
add_compile_definitions(GGML_BLAS_USE_BLIS)
pkg_check_modules(DepBLAS blis)
elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS blas-atlas)
elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS flexiblas_api)
elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
add_compile_definitions(GGML_BLAS_USE_MKL)
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS mkl-sdl)
elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
Expand Down Expand Up @@ -74,10 +72,26 @@ if (BLAS_FOUND)

target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})

if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
if ("${GGML_BLAS_VENDOR}" STREQUAL "")
message(WARNING "GGML_BLAS_VENDOR is not set; some methods may not link properly.")
endif()

if ("${GGML_BLAS_VENDOR}" MATCHES "Intel" OR ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND "${GGML_BLAS_VENDOR}" MATCHES "Generic"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()

if ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
add_compile_definitions(GGML_BLAS_USE_OPENBLAS)
endif()

if ("${GGML_BLAS_VENDOR}" MATCHES "FLAME" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL_mt")
add_compile_definitions(GGML_BLAS_USE_BLIS)
endif()

if ("${GGML_BLAS_VENDOR}" MATCHES "NVPL")
add_compile_definitions(GGML_BLAS_USE_NVPL)
endif()

target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES})
target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
else()
Expand Down
14 changes: 5 additions & 9 deletions ggml/src/ggml-blas/ggml-blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
#endif
}

#if defined(OPENBLAS_VERSION)
#if defined(GGML_BLAS_USE_OPENBLAS)
openblas_set_num_threads(ctx->n_threads);
#endif

#if defined(GGML_BLAS_USE_BLIS)
#elif defined(GGML_BLAS_USE_BLIS)
bli_thread_set_num_threads(ctx->n_threads);
#endif

#if defined(GGML_BLAS_USE_NVPL)
#elif defined(GGML_BLAS_USE_NVPL)
nvpl_blas_set_num_threads(ctx->n_threads);
#endif

Expand Down Expand Up @@ -288,7 +284,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
/* .context = */ ctx,
};

#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP)
if (openblas_get_parallel() != OPENBLAS_OPENMP) {
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
}
Expand Down Expand Up @@ -329,7 +325,7 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t
return "BLIS";
#elif defined(GGML_BLAS_USE_NVPL)
return "NVPL";
#elif defined(OPENBLAS_VERSION)
#elif defined(GGML_BLAS_USE_OPENBLAS)
return "OpenBLAS";
#else
return "BLAS";
Expand Down
17 changes: 10 additions & 7 deletions ggml/src/ggml-cuda/mmq.cu
Original file line number Diff line number Diff line change
Expand Up @@ -333,28 +333,31 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
}

if (amd_wmma_available(cc)) {
// RDNA 4 is consistently worse on rocblas
// https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
// High expert counts almost always better on MMQ
// due to a large amount of graph splits
// High expert counts are almost always better on MMQ due to
// the synchronization overhead in the cuBLAS/hipBLAS path:
// https://github.com/ggml-org/llama.cpp/pull/18202
if (n_experts >= 64) {
return true;
}

// For some quantization types MMQ can have lower peak TOPS than hipBLAS
// so it's only faster for sufficiently small batch sizes:
switch (type) {
// These quants are really bad on MMQ
case GGML_TYPE_Q2_K:
return ne11 <= 128;
case GGML_TYPE_Q6_K:
// These quants are usually worse but not always
return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
return ne11 <= 128;
return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
default:
return true;
}
}

// For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
// https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
return true;
}

Expand Down
2 changes: 1 addition & 1 deletion scripts/pr2wt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ org_repo=${org_repo%.git}

echo "org/repo: $org_repo"

meta=$(curl -sSf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR")
meta=$(curl -sSLf -H "Accept: application/vnd.github+json" "https://api.github.com/repos/$org_repo/pulls/$PR")

url_remote=$(echo "$meta" | jq -r '.head.repo.clone_url')
head_ref=$(echo "$meta" | jq -r '.head.ref')
Expand Down
51 changes: 49 additions & 2 deletions tests/test-backend-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,28 @@ static bool ggml_is_view_op(enum ggml_op op) {
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
}

static bool backend_has_feature(ggml_backend_t backend, const char * feature_name) {
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);

auto get_features = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
if (!get_features) {
return false;
}

const ggml_backend_feature * features = get_features(reg);
if (!features) {
return false;
}

for (const ggml_backend_feature * f = features; f->name; ++f) {
if (strcmp(f->name, feature_name) == 0 && strcmp(f->value, "1") == 0) {
return true;
}
}
return false;
}

enum test_mode {
MODE_TEST,
MODE_PERF,
Expand Down Expand Up @@ -1101,6 +1123,11 @@ struct test_case {
return 1e-7;
}

virtual double max_nmse_err(ggml_backend_t backend) {
GGML_UNUSED(backend);
return max_nmse_err();
}

virtual double max_maa_err() {
return 1e-4;
}
Expand All @@ -1109,6 +1136,10 @@ struct test_case {
return max_nmse_err();
}

virtual double max_err(ggml_backend_t backend) {
return max_nmse_err(backend);
}

virtual double err(const float * a, const float * b, size_t n) {
return nmse(a, b, n);
}
Expand Down Expand Up @@ -1378,8 +1409,8 @@ struct test_case {
}

double err = ud->tc->err(f1.data(), f2.data(), f1.size());
if (err > ud->tc->max_err()) {
printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err());
if (err > ud->tc->max_err(ud->backend1)) {
printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err(ud->backend1));
//for (int i = 0; i < (int) f1.size(); i++) {
// printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
//}
Expand Down Expand Up @@ -3686,6 +3717,14 @@ struct test_mul_mat : public test_case {
return 5e-4;
}

double max_nmse_err(ggml_backend_t backend) override {
// for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance
if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
return 2e-2;
}
return max_nmse_err();
}

int64_t grad_nmax() override {
return 20000;
}
Expand Down Expand Up @@ -3814,6 +3853,14 @@ struct test_mul_mat_id : public test_case {
return 5e-4;
}

double max_nmse_err(ggml_backend_t backend) override {
// for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance
if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
return 2e-2;
}
return max_nmse_err();
}

uint64_t op_flops(ggml_tensor * t) override {
GGML_UNUSED(t);
return 2 * m * k * n * n_used;
Expand Down
6 changes: 3 additions & 3 deletions tools/server/tests/unit/test_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,12 +393,12 @@ def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
if expect_ok:
assert res.status_code == 200

# note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
if res.status_code == 200:
assert "content" in res.body
if "timings" in res.body:
assert res.body["timings"]["predicted_n"] == n_predict
else:
assert res.status_code == 500
assert "content" not in res.body


@pytest.mark.parametrize(
Expand Down
Loading