From 1ce0126b18c58b9c5d6586a606fdf2780162afa1 Mon Sep 17 00:00:00 2001 From: Jesse Ikonen Date: Wed, 24 Dec 2025 11:19:47 +0200 Subject: [PATCH 1/5] docs: Fix typos in SYCL documentation (#18269) --- docs/backend/SYCL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index f44458ed3b5..bcb3ce6743d 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -829,7 +829,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 No. We can't support Ollama issue directly, because we aren't familiar with Ollama. - Sugguest reproducing on llama.cpp and report similar issue to llama.cpp. We will surpport it. + Suggest reproducing on llama.cpp and report similar issue to llama.cpp. We will support it. It's same for other projects including llama.cpp SYCL backend. From ce7a6dc0fc1dbeec492d262414856223a98dd4a2 Mon Sep 17 00:00:00 2001 From: Wang Weixuan Date: Wed, 24 Dec 2025 17:50:24 +0800 Subject: [PATCH 2/5] CANN : refactor ACL graph cache (#17752) Move the graph property checking code into methods of LRU cache. Signed-off-by: Wang Weixuan --- ggml/src/ggml-cann/common.h | 158 ++++++++++++++++++++++++-- ggml/src/ggml-cann/ggml-cann.cpp | 186 +++---------------------------- 2 files changed, 165 insertions(+), 179 deletions(-) diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 3a461ef1a72..e9a21e1b055 100644 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -229,6 +229,60 @@ struct ggml_graph_node_properties { // op ggml_op node_op; int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + + /** + * @brief Check if a ggml tensor node matches this property set. + * + * This function compares all relevant fields (address, op type, shape, source inputs, op params) + * to determine whether the current node matches these previously recorded properties. + * + * @param node The current ggml tensor node. + * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise. + */ + bool has_matching_properties(ggml_tensor * node) { + if (node->data != this->node_address && node->op != GGML_OP_VIEW) { + return false; + } + + if (node->op != this->node_op) { + return false; + } + + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->ne[i] != this->ne[i]) { + return false; + } + if (node->nb[i] != this->nb[i]) { + return false; + } + } + + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (node->src[i]) { + if (node->src[i]->data != this->src_address[i] && node->op != GGML_OP_VIEW) { + return false; + } + + for (int d = 0; d < GGML_MAX_DIMS; d++) { + if (node->src[i]->ne[d] != this->src_ne[i][d]) { + return false; + } + if (node->src[i]->nb[d] != this->src_nb[i][d]) { + return false; + } + } + } else { + if (this->src_address[i] != nullptr) { + return false; + } + } + } + + if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) { + return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0; + } + return true; + } }; struct ggml_cann_graph { @@ -241,6 +295,79 @@ struct ggml_cann_graph { aclmdlRI graph = nullptr; std::vector ggml_graph_properties; + + /** + * @brief Create a new CANN graph from a ggml computation graph. + * + * This function creates a new ggml_cann_graph object and fills its node properties + * (operation type, dimensions, strides, input sources, and operation parameters) + * based on the current ggml computation graph. 
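+     * The returned graph is heap-allocated; the LRU cache it is pushed into takes
+     * ownership and deletes it when the cache is cleared or the graph is evicted.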
+ * + * Each node in the ggml graph is mapped to a property entry in the new CANN graph: + * - node address + * - operation type + * - shape (ne) and strides (nb) + * - source tensor addresses + * - operation parameters + * + * @param cgraph The current ggml computation graph. + * @return Pointer to the newly created ggml_cann_graph object. + */ + static ggml_cann_graph * create_from_cgraph(ggml_cgraph * cgraph) { + ggml_cann_graph * new_graph = new ggml_cann_graph(); + new_graph->ggml_graph_properties.resize(cgraph->n_nodes); + + for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) { + ggml_tensor * node = cgraph->nodes[node_idx]; + auto & prop = new_graph->ggml_graph_properties[node_idx]; + + prop.node_address = node->data; + prop.node_op = node->op; + + std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne); + std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb); + + for (int src = 0; src < GGML_MAX_SRC; ++src) { + if (node->src[src]) { + prop.src_address[src] = node->src[src]->data; + std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]); + std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]); + } else { + prop.src_address[src] = nullptr; + std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0); + std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0); + } + } + + memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS); + } + + return new_graph; + } + + /** + * @brief Check whether this CANN graph matches the given ggml computation graph. + * + * This function compares the number of nodes and each node's properties + * (operation type, dimensions, strides, inputs, and operation parameters) + * to determine whether this CANN graph matches the given ggml graph. + * + * @param cgraph The current ggml computation graph. + * @return true if this CANN graph matches the ggml graph; false otherwise. + */ + bool matches_cgraph(ggml_cgraph * cgraph) { + if (this->ggml_graph_properties.size() != static_cast(cgraph->n_nodes)) { + return false; + } + + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (!this->ggml_graph_properties[i].has_matching_properties(cgraph->nodes[i])) { + return false; + } + } + + return true; + } }; /** @@ -272,15 +399,6 @@ struct ggml_cann_graph_lru_cache { cache_list.push_front(new_node); } - /** - * @brief Move an existing graph to the front of the cache. - * @param node Pointer to the ggml_cann_graph to move. - */ - void move_to_front(ggml_cann_graph * node) { - cache_list.remove(node); - cache_list.push_front(node); - } - /** * @brief Clear all graphs from the cache (also frees memory). */ @@ -295,6 +413,28 @@ struct ggml_cann_graph_lru_cache { * @brief Destructor that clears the cache and frees all cached graphs. */ ~ggml_cann_graph_lru_cache() { clear(); } + + /** + * @brief Find a cached CANN graph that matches the given ggml graph and move it to front. + * + * This function iterates through the cached CANN graphs stored in the LRU cache and + * compares them against the given ggml computation graph. If a matching graph is found, + * it is promoted to the front of the LRU cache and returned. Otherwise, the function + * returns nullptr. + * + * @param cgraph The current ggml computation graph. + * @return true if found; false otherwise. 
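+     *         When true is returned, the matched graph has been promoted to cache_list.front(),
+     *         which is where callers retrieve it afterwards.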
+ */ + bool find_and_move_to_front(ggml_cgraph * cgraph) { + for (auto & graph_ptr : this->cache_list) { + if (graph_ptr->matches_cgraph(cgraph)) { + cache_list.remove(graph_ptr); + cache_list.push_front(graph_ptr); + return true; + } + } + return false; + } }; #endif // USE_ACL_GRAPH diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index da624c587c2..402e86d705d 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2075,162 +2075,6 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) { ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); } -#ifdef USE_ACL_GRAPH -/** - * @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph. - * - * This function creates a new ggml_cann_graph object and fills its node properties - * (operation type, dimensions, strides, input sources, and operation parameters) - * based on the current ggml computation graph. - * - * Each node in the ggml graph is mapped to a property entry in the new CANN graph: - * - node address - * - operation type - * - shape (ne) and strides (nb) - * - source tensor addresses - * - operation parameters - * - * After initialization, the new graph is pushed into the LRU cache owned by the - * CANN backend context. The cache takes ownership of the graph and manages its - * lifetime (including deletion upon eviction). - * - * @param cann_ctx The CANN backend context containing the graph cache. - * @param cgraph The current ggml computation graph. - */ -static void add_lru_matched_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { - // Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache). - ggml_cann_graph * new_graph = new ggml_cann_graph(); - new_graph->ggml_graph_properties.resize(cgraph->n_nodes); - - for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) { - ggml_tensor * node = cgraph->nodes[node_idx]; - auto & prop = new_graph->ggml_graph_properties[node_idx]; - - prop.node_address = node->data; - prop.node_op = node->op; - - std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne); - std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb); - - for (int src = 0; src < GGML_MAX_SRC; ++src) { - if (node->src[src]) { - prop.src_address[src] = node->src[src]->data; - std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]); - std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]); - } else { - prop.src_address[src] = nullptr; - std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0); - std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0); - } - } - - memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS); - } - - // Insert into the LRU cache (cache takes ownership and will delete it when evicted). - cann_ctx->graph_lru_cache.push(new_graph); -} - -/** - * @brief Check if a ggml tensor node matches a previously captured CANN graph node. - * - * This function compares all relevant fields (address, op type, shape, source inputs, op params) - * to determine whether the current node matches a previously recorded version. - * - * @param node The current ggml tensor node. - * @param graph_node_properties The stored properties of a CANN graph node. - * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise. 
- */ -static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, - ggml_graph_node_properties * graph_node_properties) { - if (node->data != graph_node_properties->node_address && node->op != GGML_OP_VIEW) { - return false; - } - - if (node->op != graph_node_properties->node_op) { - return false; - } - - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if (node->ne[i] != graph_node_properties->ne[i]) { - return false; - } - if (node->nb[i] != graph_node_properties->nb[i]) { - return false; - } - } - - for (int i = 0; i < GGML_MAX_SRC; i++) { - if (node->src[i]) { - if (node->src[i]->data != graph_node_properties->src_address[i] && node->op != GGML_OP_VIEW) { - return false; - } - - for (int d = 0; d < GGML_MAX_DIMS; d++) { - if (node->src[i]->ne[d] != graph_node_properties->src_ne[i][d]) { - return false; - } - if (node->src[i]->nb[d] != graph_node_properties->src_nb[i][d]) { - return false; - } - } - } else { - if (graph_node_properties->src_address[i] != nullptr) { - return false; - } - } - } - - if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) { - return memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0; - } - return true; -} - -/** - * @brief Check whether there is a cached CANN graph that matches the current ggml graph. - * - * This function iterates through the cached CANN graphs stored in the LRU cache and - * compares them against the given ggml computation graph. A match requires that the - * number of nodes is the same and that each node’s properties (operation type, - * dimensions, strides, inputs, and operation parameters) are identical. - * - * If a matching graph is found, it is promoted to the front of the LRU cache and the - * function returns true. Otherwise, the function returns false, indicating that a new - * CANN graph needs to be captured. - * - * @param cann_ctx The CANN backend context containing the graph cache. - * @param cgraph The current ggml computation graph. - * @return true if a matching cached graph exists; false otherwise. - */ -static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { - ggml_cann_graph_lru_cache & lru_cache = cann_ctx->graph_lru_cache; - for (auto & graph_ptr : lru_cache.cache_list) { - // Skip graphs with a different number of nodes. - if (graph_ptr->ggml_graph_properties.size() != static_cast(cgraph->n_nodes)) { - continue; - } - - // Check if all nodes match. - bool all_match = true; - for (int i = 0; i < cgraph->n_nodes; ++i) { - if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) { - all_match = false; - break; - } - } - - if (all_match) { - // update cache_list && renturn graph_ptr - lru_cache.move_to_front(graph_ptr); - return true; - } - } - - return false; -} -#endif // USE_ACL_GRAPH - /** * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API. * @@ -2239,23 +2083,23 @@ static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * * * Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher. * - * @param cann_ctx The CANN backend context. - * @param cgraph The ggml computation graph. - * @param use_cann_graph Whether to use CANN graph execution. - * @param cann_graph_update_required Whether graph capture is needed due to graph changes. + * @param cann_ctx The CANN backend context. + * @param cgraph The ggml computation graph. 
+ * @param use_cann_graph Whether to use CANN graph execution. + * @param cann_graph_capture_required Whether graph capture is needed due to graph changes. */ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph, - bool & use_cann_graph, - bool & cann_graph_update_required) { + bool use_cann_graph, + bool cann_graph_capture_required) { #ifdef USE_ACL_GRAPH - if (use_cann_graph && cann_graph_update_required) { // Begin CANN graph capture + if (use_cann_graph && cann_graph_capture_required) { // Begin CANN graph capture ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL)); } #endif // USE_ACL_GRAPH // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph. // With the use of CANN graphs, the execution will be performed by the graph launch. - if (!use_cann_graph || cann_graph_update_required) { + if (!use_cann_graph || cann_graph_capture_required) { for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2274,9 +2118,10 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx #ifdef USE_ACL_GRAPH if (use_cann_graph) { + GGML_ASSERT(!cann_ctx->graph_lru_cache.cache_list.empty()); ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); - if (cann_graph_update_required) { // End CANN graph capture + if (cann_graph_capture_required) { // End CANN graph capture ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph)); } @@ -2306,7 +2151,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, // calculate rope cache for fist layer in current device. cann_ctx->rope_cache.cached = false; - bool cann_graph_update_required = false; + bool graph_capture_required = false; #ifdef USE_ACL_GRAPH bool use_cann_graph = true; @@ -2331,16 +2176,17 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, if (use_cann_graph) { // If no matching graph is found, the graph needs to be recaptured. - cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph); - if (cann_graph_update_required) { + graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph); + if (graph_capture_required) { // If no matching graph is found, add a new ACL graph. 
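+            // Either way the graph to use ends up at cache_list.front(): find_and_move_to_front()
+            // promotes a cache hit, and push() inserts a newly created graph at the front, which
+            // evaluate_and_capture_cann_graph() then captures into.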
- add_lru_matched_graph_node_properties(cann_ctx, cgraph); + ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph); + cann_ctx->graph_lru_cache.push(new_graph); } } #else bool use_cann_graph = false; #endif // USE_ACL_GRAPH - evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required); + evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, graph_capture_required); return GGML_STATUS_SUCCESS; } From 2a9ea2020c6d2bfe243ecd63f3c564babccc2004 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 24 Dec 2025 05:36:34 -0600 Subject: [PATCH 3/5] vulkan: fix command buffer corruption in ggml_backend_vk_event_wait (#18302) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 4f50d378cdc..1459b2608e5 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -13718,6 +13718,7 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * } static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_event_t event) { + VK_LOG_DEBUG("ggml_backend_vk_event_record(backend=" << backend << ", event=" << event << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; vk_event *vkev = (vk_event *)event->context; @@ -13747,6 +13748,7 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev } static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + VK_LOG_DEBUG("ggml_backend_vk_event_wait(backend=" << backend << ", event=" << event << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; vk_event *vkev = (vk_event *)event->context; @@ -13762,6 +13764,8 @@ static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_even } ggml_vk_wait_events(transfer_ctx, {vkev->event}); + ggml_vk_ctx_end(transfer_ctx); + ctx->transfer_ctx.reset(); } // TODO: enable async and synchronize @@ -14521,6 +14525,7 @@ static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backe } static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { + VK_LOG_DEBUG("ggml_backend_vk_device_event_synchronize(backend=" << dev << ", event=" << event << ")"); ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); vk_event *vkev = (vk_event *)event->context; From 54132f1b1fa16b419d589ac03d3266178259eb25 Mon Sep 17 00:00:00 2001 From: Saba Fallah <10401143+sfallah@users.noreply.github.com> Date: Wed, 24 Dec 2025 14:02:36 +0100 Subject: [PATCH 4/5] model : support for LlamaBidirectionalModel architecture (#18220) * model: llama-embed-nemotron * minor: python lint * changed arch-name * templated llm_build_llama to be used for both llama and llama-embed arch --- convert_hf_to_gguf.py | 5 +++++ gguf-py/gguf/constants.py | 22 ++++++++++++++++++++++ src/llama-arch.cpp | 2 ++ src/llama-arch.h | 1 + src/llama-model.cpp | 13 ++++++++++--- src/models/llama.cpp | 25 +++++++++++++++++++------ src/models/models.h | 1 + 7 files changed, 60 insertions(+), 9 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 16c5acf346d..c638e339866 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8695,6 +8695,11 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") 
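+# LlamaBidirectionalModel (converted to the "llama-embed" arch) reuses the Llama conversion
+# logic unchanged; only the target architecture constant differs.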
+@ModelBase.register("LlamaBidirectionalModel") +class LlamaEmbedNemotronModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA_EMBED + + @ModelBase.register("BailingMoeForCausalLM") class BailingMoeModel(TextModel): model_arch = gguf.MODEL_ARCH.BAILINGMOE diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 41d3bd4faf2..baff8547abe 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -449,6 +449,7 @@ class MODEL_ARCH(IntEnum): RND1 = auto() PANGU_EMBED = auto() MISTRAL3 = auto() + LLAMA_EMBED = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -844,6 +845,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.RND1: "rnd1", MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", + MODEL_ARCH.LLAMA_EMBED: "llama-embed", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -3196,6 +3198,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.LLAMA_EMBED: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ] # TODO } diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 80f44ae1bfe..73420d3c9e1 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -115,6 +115,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -500,6 +501,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_DECI: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_LLAMA_EMBED: return { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT_NORM, diff --git a/src/llama-arch.h b/src/llama-arch.h index a53bc39d183..433ee4bc18f 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -119,6 +119,7 @@ enum llm_arch { LLM_ARCH_RND1, LLM_ARCH_PANGU_EMBED, LLM_ARCH_MISTRAL3, + LLM_ARCH_LLAMA_EMBED, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0d5bcc64fe5..9fada915d7d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -606,7 +606,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { + if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) { if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } @@ -630,6 +630,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // arch-specific KVs switch (arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_LLAMA_EMBED: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -2652,6 +2653,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_LLAMA_EMBED: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -7269,16 +7271,20 @@ ggml_cgraph * 
llama_model::build_graph(const llm_graph_params & params) const { switch (arch) { case LLM_ARCH_LLAMA: { - llm = std::make_unique(*this, params); + llm = std::make_unique>(*this, params); } break; case LLM_ARCH_LLAMA4: { if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) { - llm = std::make_unique(*this, params); + llm = std::make_unique>(*this, params); } else { llm = std::make_unique(*this, params); } } break; + case LLM_ARCH_LLAMA_EMBED: + { + llm = std::make_unique>(*this, params); + } break; case LLM_ARCH_DECI: { llm = std::make_unique(*this, params); @@ -7874,6 +7880,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_LLAMA_EMBED: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 diff --git a/src/models/llama.cpp b/src/models/llama.cpp index ab7fd5d0508..42b5fcdf42e 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -1,6 +1,7 @@ #include "models.h" -llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { +template +llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -14,7 +15,14 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + using inp_attn_type = std::conditional_t; + + inp_attn_type * inp_attn = nullptr; + if constexpr (embed) { + inp_attn = build_attn_inp_no_cache(); + } else { + inp_attn = build_attn_inp_kv(); + } const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -145,11 +153,16 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - // lm_head - cur = build_lora_mm(model.output, cur); + if constexpr (!embed) { + // lm_head + cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - res->t_logits = cur; + cb(cur, "result_output", -1); + res->t_logits = cur; + } ggml_build_forward_expand(gf, cur); } + +template struct llm_build_llama; +template struct llm_build_llama; diff --git a/src/models/models.h b/src/models/models.h index 53a5810659a..fca505b30ab 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -303,6 +303,7 @@ struct llm_build_llada_moe : public llm_graph_context { llm_build_llada_moe(const llama_model & model, const llm_graph_params & params); }; +template struct llm_build_llama : public llm_graph_context { llm_build_llama(const llama_model & model, const llm_graph_params & params); }; From c8a2417d7b65705231eeb5b4080d3e0428c9c5d2 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 24 Dec 2025 22:28:26 +0800 Subject: [PATCH 5/5] CUDA: experimental native mxfp4 support for blackwell (#17906) * CUDA: experimental native mxfp4 support for blackwell * optimize load_tiles * optimize quantize_mxfp4 * cleanup * first pass review: formatting * use interleaved layout for mma * mmq: add assert for size * use __nv_fp4x4_e2m1 * use iter_k as 512, cleanup * Use 1200 as blackwell instead of 1000 * address review comments * mmq: fix stride * quantize.cu: use reference impl of e8m0 scale * address review comments * add 120f-virtual + minor fixes --------- Co-authored-by: Aman Gupta --- ggml/src/ggml-cuda/CMakeLists.txt | 5 + ggml/src/ggml-cuda/common.cuh | 35 ++++++ ggml/src/ggml-cuda/mma.cuh | 21 ++++ ggml/src/ggml-cuda/mmq.cu | 35 +++++- ggml/src/ggml-cuda/mmq.cuh | 179 ++++++++++++++++++++++++++++-- ggml/src/ggml-cuda/quantize.cu | 151 +++++++++++++++++++++++++ ggml/src/ggml-cuda/quantize.cuh | 14 +++ ggml/src/ggml-cuda/vendors/cuda.h | 4 + 8 files changed, 426 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index 67af1d8ccc1..f1412e8b196 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -15,6 +15,7 @@ if (CUDAToolkit_FOUND) # 80 == Ampere, asynchronous data loading, faster tensor core instructions # 86 == RTX 3000, needs CUDA v11.1 # 89 == RTX 4000, needs CUDA v11.8 + # 120 == Blackwell, needs CUDA v12.8, FP4 tensor cores # # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run # XX-real == compile CUDA code as device code for this specific architecture @@ -34,6 +35,10 @@ if (CUDAToolkit_FOUND) if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8") list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real) endif() + + if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") + list(APPEND CMAKE_CUDA_ARCHITECTURES 120f-virtual) + endif() endif() endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 9fcb2f9fd21..62e618850bf 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -50,6 +50,10 @@ #define GGML_CUDA_CC_TURING 750 #define GGML_CUDA_CC_AMPERE 800 #define GGML_CUDA_CC_ADA_LOVELACE 890 +// While BW spans CC 1000, 1100 & 1200, we are integrating Tensor Core instructions available to 1200 family, see +// 
https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms +#define GGML_CUDA_CC_BLACKWELL 1200 +#define GGML_CUDA_CC_RUBIN 1300 #define GGML_CUDA_CC_OFFSET_AMD 0x1000000 #define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000 #define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS) @@ -246,6 +250,10 @@ static const char * cu_get_error_str(CUresult err) { #define AMPERE_MMA_AVAILABLE #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE +#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL && __CUDA_ARCH__ < GGML_CUDA_CC_RUBIN +# define BLACKWELL_MMA_AVAILABLE +#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL + #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #define CP_ASYNC_AVAILABLE #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE @@ -316,6 +324,11 @@ static bool cp_async_available(const int cc) { return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE; } +static bool blackwell_mma_available(const int cc) { + return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_BLACKWELL && + ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_RUBIN; +} + static constexpr __device__ int ggml_cuda_get_physical_warp_size() { #if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__)) return 64; @@ -701,6 +714,28 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) { #endif // CUDART_VERSION >= 12050 } +__device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) { + const uint8_t sign_bit = (x < 0.0f) << 3; + float ax = fabsf(x) * e; + + // Positive LUT + static constexpr float pos_lut[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f }; + + int best_i = 0; + float best_err = fabsf(ax - pos_lut[0]); + +#pragma unroll + for (int i = 1; i < 8; ++i) { + const float err = fabsf(ax - pos_lut[i]); + if (err < best_err) { + best_err = err; + best_i = i; + } + } + + return static_cast(best_i | sign_bit); +} + // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. 
// Precompute mp (m' in the paper) and L such that division // can be computed using a multiply (high 32b of 64b result) diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 3268dadfe83..df9eed71172 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -900,6 +900,27 @@ namespace ggml_cuda_mma { #endif // AMPERE_MMA_AVAILABLE } + static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D, + const tile<16, 8, int> & A, + const tile<8, 8, int> & B, + uint32_t a_scale, + uint32_t b_scale) { +#ifdef BLACKWELL_MMA_AVAILABLE + const int * Axi = (const int *) A.x; + const int * Bxi = (const int *) B.x; + float * Dxi = (float *) D.x; + + asm volatile( + "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 " + "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, " + "%10, {0, 0}, %11, {0, 0};" + : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3]) + : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale)); +#else + GGML_UNUSED_VARS(D, A, B, a_scale, b_scale); +#endif // BLACKWELL_MMA_AVAILABLE + } + static __device__ __forceinline__ void mma( tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) { #ifdef TURING_MMA_AVAILABLE diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index f7a2cbca90f..6156dcdae74 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -1,3 +1,4 @@ +#include "common.cuh" #include "mmq.cuh" #include "quantize.cuh" #include "mmid.cuh" @@ -114,6 +115,9 @@ void ggml_cuda_mul_mat_q( const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_CDNA(cc); + // TODO: tighter pool buffer size vs q8 path + const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4; + if (!ids) { const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 + get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq); @@ -123,12 +127,24 @@ void ggml_cuda_mul_mat_q( const int64_t s11 = src1->nb[1] / ts_src1; const int64_t s12 = src1->nb[2] / ts_src1; const int64_t s13 = src1->nb[3] / ts_src1; - quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, - ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); + if (use_native_mxfp4) { + static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1)); + quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, + ne11, ne12, ne13, stream); + + } else { + quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, + ne11, ne12, ne13, stream); + } CUDA_CHECK(cudaGetLastError()); } - const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int)); + // Stride depends on quantization format + const int64_t s12 = use_native_mxfp4 ? 
+ ne11 * ne10_padded * sizeof(block_fp4_mmq) / + (8 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 256 values (8 blocks of 32) + : + ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int)); const int64_t s13 = ne12*s12; const mmq_args args = { @@ -175,12 +191,19 @@ void ggml_cuda_mul_mat_q( const int64_t s11 = src1->nb[1] / ts_src1; const int64_t s12 = src1->nb[2] / ts_src1; const int64_t s13 = src1->nb[2] / ts_src1; - quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, - ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream); + + if (use_native_mxfp4) { + quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13, + ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream); + } else { + quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13, + ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream); + } CUDA_CHECK(cudaGetLastError()); } - const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int)); + const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) : + ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int)); const int64_t s13 = ne12*s12; // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid. diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index fa8a72c9c10..63451ffab7f 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -11,6 +11,7 @@ using namespace ggml_cuda_mma; #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available. #define MMQ_ITER_K 256 +#define MMQ_ITER_K_MXFP4_FP4 512 #define MMQ_NWARPS 8 typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride); @@ -44,8 +45,15 @@ struct block_q8_1_mmq { }; int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each }; + +struct block_fp4_mmq { + uint32_t d4[4]; // 8 E8M0 scales (1 per 32 values), 2 packed per uint32: d4[0]={s0,s1}, d4[1]={s2,s3}, etc. + int8_t qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte), 8 blocks of 32 values +}; + static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size"); static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size"); +static_assert(sizeof(block_fp4_mmq) == sizeof(block_q8_1_mmq), "Unexpected block_fp4_mmq size"); static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) { switch (type_x) { @@ -129,6 +137,14 @@ static int get_mmq_y_host(const int cc) { ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64); } +static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) { +#if defined(BLACKWELL_MMA_AVAILABLE) + return type == GGML_TYPE_MXFP4 ? 
MMQ_ITER_K_MXFP4_FP4 : MMQ_ITER_K; +#else + return MMQ_ITER_K; +#endif // defined(BLACKWELL_MMA_AVAILABLE) +} + static constexpr __device__ int get_mmq_y_device() { #if defined(GGML_USE_HIP) #if defined(RDNA1) @@ -191,6 +207,7 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml } #define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4) +#define MMQ_MMA_TILE_X_K_FP4 (2*MMQ_TILE_NE_K + 8 + 4) #define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4) #define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4) #define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4) @@ -201,6 +218,8 @@ static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding."); static_assert(MMQ_MMA_TILE_X_K_Q2_K % 8 == 4, "Wrong padding."); static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding."); static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding."); +static_assert(MMQ_MMA_TILE_X_K_FP4 % 8 == 4, "Wrong padding."); +static_assert(MMQ_MMA_TILE_X_K_FP4 == MMQ_MMA_TILE_X_K_Q8_1, "Wrong tile size for MXFP4"); static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { switch (type) { @@ -209,6 +228,7 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0; case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1; case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0; + // tile sizes are the same for Q8_1 and FP4 for blackwell case GGML_TYPE_MXFP4: return MMQ_MMA_TILE_X_K_Q8_1; case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K; case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K; @@ -228,7 +248,8 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { } // block_q8_1_mmq has (128 8-bit ints == 32 32-bit ints + 4 32-bit scales) -#define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K/QI8_1) +#define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K / QI8_1) +#define MMQ_TILE_Y_FP4_K MMQ_TILE_Y_K static int mmq_get_granularity_host(const int mmq_x, const int cc) { if (amd_mfma_available(cc) || amd_wmma_available(cc)) { @@ -761,6 +782,50 @@ template static __device__ __forceinline__ void loa } } +template +static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restrict__ x, + int * __restrict__ x_tile, + const int kbx0, + const int i_max, + const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + + int * x_qs = (int *) x_tile; + uint32_t * x_sc = (uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K); + + const int txi = threadIdx.x; + + constexpr int iter_k = get_iter_k(GGML_TYPE_MXFP4); + + constexpr int threads_per_row = iter_k / QK_MXFP4; // each thread processes 1 block + constexpr int rows_per_warp = warp_size / threads_per_row; + const int kbx = txi % threads_per_row; + const int row_in_warp = txi / threads_per_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += rows_per_warp * nwarps) { + int i = i0 + threadIdx.y * rows_per_warp + row_in_warp; + + if constexpr (need_check) { + i = min(i, i_max); + } + + const block_mxfp4 * bxi = (const block_mxfp4 *) x + kbx0 + i * stride + kbx; + + // quantize_mxfp4_mmq permutes nibbles to match the quantized format + const int k0 = kbx * 4; + memcpy(x_qs + i * MMQ_MMA_TILE_X_K_FP4 + k0, bxi->qs, 16); + + // Load E8M0 scales: pack 2 consecutive scales into one uint32 + if (kbx % 2 == 0) { + uint32_t e = bxi->e; + e |= ((bxi + 1)->e << 8); + x_sc[i * MMQ_MMA_TILE_X_K_FP4 + kbx / 2] = e; + } + 
} +} + template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { @@ -931,6 +996,78 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) } +template +static __device__ __forceinline__ void vec_dot_mxfp4_mxfp4_mma(const int * __restrict__ x, + const int * __restrict__ y, + float * __restrict__ sum, + const int k00) { + typedef tile<16, 8, int> tile_A; + typedef tile<8, 8, int> tile_B; + typedef tile<16, 8, float> tile_C; // Output is float for native scaled MMA + + constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int rows_per_warp = 2 * granularity; + constexpr int ntx = rows_per_warp / tile_C::I; // Number of x minitiles per warp. + + y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_FP4_K); + + // Match layout from load_tiles_mxfp4_fp4 + const int * x_qs = (const int *) x; + const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K); + const int * y_qs = (const int *) y + 4; + const uint32_t * y_sc = (const uint32_t *) y; + + // tile_A has a length of 64 logical values vs. 32 values in block_mxfp4 + tile_A A[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)]; + uint32_t scaleA[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)]; + + // Block scale + // Each thread has to point to a 4 byte scale value + // https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling + + const int i0 = (threadIdx.y / ntx) * rows_per_warp; + +#pragma unroll + for (int n = 0; n < ntx; ++n) { +#pragma unroll + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) { + const int k0 = k00 + k01; + + load_ldmatrix(A[n][k01 / (2 * QI_MXFP4)], x_qs + (i0 + n * tile_A::I) * MMQ_MMA_TILE_X_K_FP4 + k0, + MMQ_MMA_TILE_X_K_FP4); + + // based on block-scaling document, 2 threads in each quad need to supply to the scale value + const int tidx = threadIdx.x / 4 + (threadIdx.x % 2) * 8; + scaleA[n][k01 / (2 * QI_MXFP4)] = + *(x_sc + (i0 + n * tile_A::I + tidx) * MMQ_MMA_TILE_X_K_FP4 + k0 / (2 * QI_MXFP4)); + } + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) { +#pragma unroll + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) { + tile_B B; + uint32_t scaleB; // 2xN scales + + load_generic(B, y_qs + j0 * MMQ_TILE_Y_FP4_K + k01, MMQ_TILE_Y_FP4_K); + + scaleB = y_sc[(j0 + threadIdx.x / 4) * MMQ_TILE_Y_FP4_K + k01 / (2 * QI_MXFP4)]; + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + tile_C C; + + mma_block_scaled(C, A[n][k01 / (2 * QI_MXFP4)], B, scaleA[n][k01 / (2 * QI_MXFP4)], scaleB); +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l]; + } + } + } + } +} + template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { @@ -3109,8 +3246,13 @@ struct mmq_type_traits { template struct mmq_type_traits { static constexpr int vdr = VDR_MXFP4_Q8_1_MMQ; +#ifdef BLACKWELL_MMA_AVAILABLE + static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4_fp4; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_mxfp4_mxfp4_mma; +#else static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4; static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; +#endif // BLACKWELL_MMA_AVAILABLE static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; @@ -3243,17 +3385,26 @@ static 
__device__ __forceinline__ void mul_mat_q_process_tile( constexpr mmq_write_back_t write_back = mmq_write_back_dp4a; #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - constexpr int blocks_per_iter = MMQ_ITER_K / qk; +#if defined(BLACKWELL_MMA_AVAILABLE) + // FP4 tile stores 8 blocks + constexpr int ne_block = (type == GGML_TYPE_MXFP4) ? 8 * QK_MXFP4 : 4 * QK8_1; +#else + constexpr int ne_block = 4 * QK8_1; +#endif // defined(BLACKWELL_MMA_AVAILABLE) + + constexpr int ITER_K = get_iter_k(type); + constexpr int blocks_per_iter = ITER_K / qk; float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f}; + constexpr int sz = sizeof(block_q8_1_mmq) / sizeof(int); + for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); - { - const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int)); + const int * by0 = y + ncols_y * (kb0 * qk / ne_block) * sz; #pragma unroll - for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) { + for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) { int l = l0 + threadIdx.y*warp_size + threadIdx.x; tile_y[l] = by0[l]; @@ -3267,9 +3418,9 @@ static __device__ __forceinline__ void mul_mat_q_process_tile( __syncthreads(); { - const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int)); + const int * by0 = y + ncols_y * ((kb0 * qk / ne_block) * sz + sz); #pragma unroll - for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) { + for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) { int l = l0 + threadIdx.y*warp_size + threadIdx.x; tile_y[l] = by0[l]; @@ -3401,8 +3552,10 @@ static __global__ void mul_mat_q( } #endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA + constexpr int ITER_K = get_iter_k(type); + const int64_t blocks_per_ne00 = ncols_x / qk; - constexpr int blocks_per_iter = MMQ_ITER_K / qk; + constexpr int blocks_per_iter = ITER_K / qk; // kbc == k block continuous, current index in continuous ijk space. 
int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; @@ -3463,7 +3616,7 @@ static __global__ void mul_mat_q( __syncthreads(); } - offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_y += (col_low + jt * mmq_x) * (sizeof(block_q8_1_mmq) / sizeof(int)); offset_dst += it*mmq_y; const int tile_x_max_i = nrows_x - it*mmq_y - 1; @@ -3530,7 +3683,7 @@ static __global__ void mul_mat_q( __syncthreads(); } - offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_y += (col_low + jt * mmq_x) * (sizeof(block_q8_1_mmq) / sizeof(int)); offset_dst += it*mmq_y; const int tile_x_max_i = nrows_x - it*mmq_y - 1; @@ -3553,7 +3706,9 @@ static __global__ void mul_mat_q_stream_k_fixup( const int ncols_max) { constexpr int mmq_y = get_mmq_y_device(); constexpr int qk = ggml_cuda_type_traits::qk; - constexpr int blocks_per_iter = MMQ_ITER_K / qk; + constexpr int ITER_K = get_iter_k(type); + + constexpr int blocks_per_iter = ITER_K / qk; const int64_t blocks_per_ne00 = ncols_x / qk; constexpr int nwarps = mmq_get_nwarps_device(); @@ -3711,7 +3866,7 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type); const size_t nbs_ids = mmq_x*sizeof(int); const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); - const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq); + const size_t nbs_y = mmq_x * (sizeof(block_q8_1_mmq)); return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int)); } diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 5117f9ffc0f..a8c68e44b16 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -47,6 +47,131 @@ static __global__ void quantize_q8_1( y[ib].ds = make_half2(d, sum); } +__device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) { + if (!(amax > 0.0f)) { + return 0; + } + + // FP4 E2M1: max exponent (unbiased) is 2. + constexpr int FP4_E2M1_EMAX = 2; + + const float e = log2f(amax); + + // "even" -> round-to-nearest integer, ties-to-even + const int e_int = __float2int_rn(e); + + const int shared_exp = e_int - FP4_E2M1_EMAX; + + int biased = shared_exp + 127; + + biased = max(biased, 0); + biased = min(biased, 254); + + return static_cast(biased); +} + +// quantize values in the format mxfp4 is stored which is interleaved nibbles +// i.e. a block a0-a31 is represented as a0a16,a1a17 ...a15a31 +static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x, + const int32_t * __restrict__ ids, + void * __restrict__ vy, + const int64_t ne00, + const int64_t s01, + const int64_t s02, + const int64_t s03, + const int64_t ne0, + const int ne1, + const int ne2) { + constexpr int vals_per_scale = 32; + constexpr int vals_per_warp = 2 * vals_per_scale; // Each warp processes 2 blocks of 32 = 64 values + + const int warp_id = threadIdx.y; + const int lane_id_32 = threadIdx.x; + + const int nwarps = blockDim.y; + + const int64_t warp_start_offset = (blockIdx.y * nwarps + warp_id) * vals_per_warp; + + if (warp_start_offset >= ne0) { + return; + } + + const int64_t i1 = blockIdx.x; + const int64_t i2 = blockIdx.z % ne2; + const int64_t i3 = blockIdx.z / ne2; + + const int64_t i01 = ids ? 
ids[i1] : i1; + const int64_t i02 = i2; + const int64_t i03 = i3; + + block_fp4_mmq * y = (block_fp4_mmq *) vy; + + const int64_t block_fp4_mmq_size = 8 * QK_MXFP4; // 256 values + const int64_t ib0 = blockIdx.z * ((int64_t) ne1 * (ne0 / block_fp4_mmq_size)); + const int64_t ib = ib0 + (warp_start_offset / block_fp4_mmq_size) * ne1 + blockIdx.x; + const int64_t quad_idx_in_block = (warp_start_offset % block_fp4_mmq_size) / vals_per_warp; + + const int group_id = lane_id_32 / 4; + const int lane_in_group = lane_id_32 % 4; + const int base = group_id * 2; + char2 * yqs2 = (char2 *) y[ib].qs; + + const int64_t base_pos = i03 * s03 + i02 * s02 + i01 * s01; + + uint8_t scales[2]; + +#pragma unroll + for (int b = 0; b < 2; ++b) { + const int64_t i0 = warp_start_offset + b * vals_per_scale + lane_id_32; + const float xi = (i0 < ne00) ? x[base_pos + i0] : 0.0f; + + float amax = fabsf(xi); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE)); + } + + const uint8_t e = compute_e8m0_scale(amax); + scales[b] = e; + const float inv_s = (amax == 0.0f) ? 0.0f : __frcp_rn(ggml_cuda_e8m0_to_fp32(e)); + +#if CUDART_VERSION >= 12080 + const float scaled_val = xi * inv_s; + + const float val0 = __shfl_sync(0xFFFFFFFF, scaled_val, base, WARP_SIZE); + const float val1 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 16, WARP_SIZE); + const float val2 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 1, WARP_SIZE); + const float val3 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 17, WARP_SIZE); + + if (lane_in_group == 0) { + __nv_fp4x4_e2m1 fp4_packed(make_float4(val0, val1, val2, val3)); + + yqs2[quad_idx_in_block * 16 + b * 8 + group_id] = *(char2 *) &fp4_packed; + } +#else + // Fallback: manual FP4 conversion using LUT + const uint8_t q_val = ggml_cuda_float_to_fp4_e2m1(xi, inv_s); + + const uint8_t q_lo_0 = __shfl_sync(0xFFFFFFFF, q_val, base, WARP_SIZE); + const uint8_t q_lo_1 = __shfl_sync(0xFFFFFFFF, q_val, base + 1, WARP_SIZE); + const uint8_t q_hi_0 = __shfl_sync(0xFFFFFFFF, q_val, base + 16, WARP_SIZE); + const uint8_t q_hi_1 = __shfl_sync(0xFFFFFFFF, q_val, base + 17, WARP_SIZE); + + if (lane_in_group == 0) { + char2 q; + q.x = (q_hi_0 << 4) | q_lo_0; + q.y = (q_hi_1 << 4) | q_lo_1; + yqs2[quad_idx_in_block * 16 + b * 8 + group_id] = q; + } +#endif // CUDART_VERSION >= 12080 + } + + if (lane_id_32 == 0) { + // Store 2 scales packed into 1 uint32 + y[ib].d4[quad_idx_in_block] = (scales[1] << 8) | scales[0]; + } +} + template static __global__ void quantize_mmq_q8_1( const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy, @@ -190,3 +315,29 @@ void quantize_mmq_q8_1_cuda( break; } } + +void quantize_mmq_mxfp4_cuda(const float * x, + const int32_t * ids, + void * vy, + [[maybe_unused]] const ggml_type type_src0, + const int64_t ne00, + const int64_t s01, + const int64_t s02, + const int64_t s03, + const int64_t ne0, + const int64_t ne1, + const int64_t ne2, + const int64_t ne3, + cudaStream_t stream) { + GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0); + + constexpr int nwarps = 8; + constexpr int vals_per_warp = 2 * QK_MXFP4; + constexpr int vals_per_block = nwarps * vals_per_warp; + + const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block; + const dim3 num_blocks(ne1, block_num_y, ne2 * ne3); + const dim3 block_size(WARP_SIZE, nwarps, 1); + + quantize_mmq_mxfp4<<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); +} diff --git a/ggml/src/ggml-cuda/quantize.cuh 
b/ggml/src/ggml-cuda/quantize.cuh
index 725ab52443c..6a91df63578 100644
--- a/ggml/src/ggml-cuda/quantize.cuh
+++ b/ggml/src/ggml-cuda/quantize.cuh
@@ -25,3 +25,17 @@ void quantize_mmq_q8_1_cuda(
     const float * x, const int32_t * ids, void * vy, ggml_type type_src0,
     int64_t ne00, int64_t s01, int64_t s02, int64_t s03, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3,
     cudaStream_t stream);
+
+void quantize_mmq_mxfp4_cuda(const float * x,
+                             const int32_t * ids,
+                             void * vy,
+                             ggml_type type_src0,
+                             int64_t ne00,
+                             int64_t s01,
+                             int64_t s02,
+                             int64_t s03,
+                             int64_t ne0,
+                             int64_t ne1,
+                             int64_t ne2,
+                             int64_t ne3,
+                             cudaStream_t stream);
diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h
index 3b3086778ee..ba032cfab4b 100644
--- a/ggml/src/ggml-cuda/vendors/cuda.h
+++ b/ggml/src/ggml-cuda/vendors/cuda.h
@@ -10,6 +10,10 @@
 #include <cuda_fp8.h>
 #endif // CUDART_VERSION >= 12050
 
+#if CUDART_VERSION >= 12080
+#include <cuda_fp4.h>
+#endif // CUDART_VERSION >= 12080
+
 #if CUDART_VERSION < 11020
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
 #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH