From bd723005916b7616591256f4fdef0cb47ec1944f Mon Sep 17 00:00:00 2001
From: yggdrasil75
Date: Thu, 26 Feb 2026 05:26:16 -0500
Subject: [PATCH 1/7] server : fix typo in server README.md (#19900)

fix typo
---
 tools/server/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/server/README.md b/tools/server/README.md
index 0b56ca1e276..34b722a27c5 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1510,7 +1510,7 @@ version = 1
 ; If the same key is defined in a specific preset, it will override the value in this global section.
 [*]
 c = 8192
-n-gpu-layer = 8
+n-gpu-layers = 8
 
 ; If the key corresponds to an existing model on the server,
 ; this will be used as the default config for that model

From 1ca3d1de153152645cf890b637c2c6450f1615e3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 26 Feb 2026 12:46:32 +0200
Subject: [PATCH 2/7] gguf : avoid too many file size calls (#19919)

---
 ggml/src/gguf.cpp | 77 +++++++++++++++++++++++++++++------------------
 1 file changed, 47 insertions(+), 30 deletions(-)

diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index 86254393664..cbeedf6c4b6 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -228,13 +228,41 @@ struct gguf_context {
 };
 
 struct gguf_reader {
-    FILE * file;
+    gguf_reader(FILE * file) : file(file) {
+        // read the remaining bytes once and update on each read
+        nbytes_remain = file_remain(file);
+    }
 
-    gguf_reader(FILE * file) : file(file) {}
+    // helper for remaining bytes in a file
+    static uint64_t file_remain(FILE * file) {
+        const int64_t cur = gguf_ftell(file);
+        if (cur < 0) {
+            return 0;
+        }
+        if (gguf_fseek(file, 0, SEEK_END) != 0) {
+            gguf_fseek(file, cur, SEEK_SET);
+
+            return 0;
+        }
+        const int64_t end = gguf_ftell(file);
+        if (end < 0) {
+            gguf_fseek(file, cur, SEEK_SET);
+
+            return 0;
+        }
+        gguf_fseek(file, cur, SEEK_SET);
+        return static_cast<uint64_t>(end - cur);
+    }
 
     template <typename T>
     bool read(T & dst) const {
-        return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
+        const size_t size = sizeof(dst);
+        if (nbytes_remain < size) {
+            return false;
+        }
+        const size_t nread = fread(&dst, 1, size, file);
+        nbytes_remain -= nread;
+        return nread == size;
     }
 
     template <typename T>
@@ -242,20 +270,19 @@ struct gguf_reader {
         if (n > GGUF_MAX_ARRAY_ELEMENTS) {
             return false;
         }
-        const uint64_t nbytes = nbytes_remain();
         if constexpr (std::is_same<T, std::string>::value) {
             // strings are prefixed with their length, so we need to account for that
             if (n > SIZE_MAX / sizeof(uint64_t)) {
                 return false;
             }
-            if (nbytes < n * sizeof(uint64_t)) {
+            if (nbytes_remain < n * sizeof(uint64_t)) {
                 return false;
             }
         } else {
             if (n > SIZE_MAX / sizeof(T)) {
                 return false;
             }
-            if (nbytes < n * sizeof(T)) {
+            if (nbytes_remain < n * sizeof(T)) {
                 return false;
             }
         }
@@ -312,39 +339,29 @@ struct gguf_reader {
             GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds maximum %" PRIu64 "\n", __func__, size, (uint64_t) GGUF_MAX_STRING_LENGTH);
             return false;
         }
-        const uint64_t nbytes = nbytes_remain();
-        if (size > nbytes) {
-            GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds remaining file size %" PRIu64 " bytes\n", __func__, size, nbytes);
+        if (size > nbytes_remain) {
+            GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds remaining file size %" PRIu64 " bytes\n", __func__, size, nbytes_remain);
             return false;
         }
         dst.resize(static_cast<size_t>(size));
-        return fread(dst.data(), 1, dst.length(), file) == dst.length();
+        const size_t nread = fread(dst.data(), 1, size, file);
+        nbytes_remain -= nread;
+        return nread == size;
     }
 
     bool read(void * dst, const size_t size)
const { - return fread(dst, 1, size, file) == size; - } - - // remaining bytes in the file - uint64_t nbytes_remain() const { - const int64_t cur = gguf_ftell(file); - if (cur < 0) { - return 0; + if (size > nbytes_remain) { + return false; } - if (gguf_fseek(file, 0, SEEK_END) != 0) { - gguf_fseek(file, cur, SEEK_SET); + const size_t nread = fread(dst, 1, size, file); + nbytes_remain -= nread; + return nread == size; + } - return 0; - } - const int64_t end = gguf_ftell(file); - if (end < 0) { - gguf_fseek(file, cur, SEEK_SET); +private: + FILE * file; - return 0; - } - gguf_fseek(file, cur, SEEK_SET); - return static_cast(end - cur); - } + mutable uint64_t nbytes_remain; }; struct gguf_context * gguf_init_empty(void) { From 66287bdaaca3042bd4df5fc8f7bd4d58bfcd208c Mon Sep 17 00:00:00 2001 From: Maximilian Werk Date: Thu, 26 Feb 2026 12:14:09 +0100 Subject: [PATCH 3/7] model : add Jina Embeddings v5 Nano (partial EuroBERT) support (#19826) * WIP: Add EuroBERT support with autoformatting changes This commit includes: - EuroBERT model implementation for GGUF conversion - C++ backend support for EuroBERT architecture - Unintended autoformatting changes to Python files Saving before reverting formatting-only changes. * feat: add back eos assert when not last token pooling * feat: removed duplicated code and cleanup * feat: removed not working architectures and unnecessary check * fix: typo * fix: dynamic pooling config * feat: added an example model for eurobert * feat: proper llama-vocab implementation for jina-v5 * fix: removed unnecessary comments --- convert_hf_to_gguf.py | 29 +++++++++++ convert_hf_to_gguf_update.py | 1 + gguf-py/gguf/constants.py | 15 ++++++ src/CMakeLists.txt | 1 + src/llama-arch.cpp | 15 ++++++ src/llama-arch.h | 1 + src/llama-model.cpp | 39 +++++++++++++++ src/llama-vocab.cpp | 3 +- src/models/eurobert.cpp | 97 ++++++++++++++++++++++++++++++++++++ src/models/models.h | 4 ++ tests/test-tokenizer-0.sh | 9 +++- tools/imatrix/imatrix.cpp | 4 +- 12 files changed, 214 insertions(+), 4 deletions(-) create mode 100644 src/models/eurobert.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e038109599a..5709cb0e3c1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1148,6 +1148,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de res = "jina-v2-de" + if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d": + # ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano + res = "jina-v5-nano" if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct res = "smaug-bpe" @@ -6125,6 +6128,32 @@ def modify_tensors(self, data_torch, name, bid): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model") +class EuroBertModel(TextModel): + model_arch = gguf.MODEL_ARCH.EUROBERT + + def set_vocab(self): + self.gguf_writer.add_add_bos_token(False) + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # EuroBert is bidirectional (encoder) + self.gguf_writer.add_causal_attention(False) + + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + self._try_set_pooling_type() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # 
Strip "model." prefix from tensor names + if name.startswith("model."): + name = name[6:] + + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 53a73759ec1..b31ddcca774 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -107,6 +107,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, + {"name": "jina-v5-nano", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v5-text-nano", }, {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 689acdc65de..e9ef97d553c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -379,6 +379,7 @@ class MODEL_ARCH(IntEnum): NEO_BERT = auto() JINA_BERT_V2 = auto() JINA_BERT_V3 = auto() + EUROBERT = auto() BLOOM = auto() STABLELM = auto() QWEN = auto() @@ -820,6 +821,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.NEO_BERT: "neo-bert", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3", + MODEL_ARCH.EUROBERT: "eurobert", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.QWEN: "qwen", @@ -1587,6 +1589,19 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.EUROBERT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_DOWN, + ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2a661a1fe88..283823fa9c8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -62,6 +62,7 @@ add_library(llama models/dream.cpp models/ernie4-5-moe.cpp models/ernie4-5.cpp + models/eurobert.cpp models/exaone-moe.cpp models/exaone.cpp models/exaone4.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 39ebb9db027..106ea133ac0 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -26,6 +26,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NEO_BERT, "neo-bert" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" }, + { LLM_ARCH_EUROBERT, "eurobert" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_QWEN, "qwen" }, @@ -819,6 +820,20 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, }; + case LLM_ARCH_EUROBERT: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + 
LLM_TENSOR_ATTN_V,
+            LLM_TENSOR_ATTN_OUT,
+            LLM_TENSOR_FFN_NORM,
+            LLM_TENSOR_FFN_GATE,
+            LLM_TENSOR_FFN_UP,
+            LLM_TENSOR_FFN_DOWN,
+        };
     case LLM_ARCH_MODERN_BERT:
         return {
             LLM_TENSOR_TOKEN_EMBD,
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 11daa141334..42a6ea38f38 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -30,6 +30,7 @@ enum llm_arch {
     LLM_ARCH_NEO_BERT,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_JINA_BERT_V3,
+    LLM_ARCH_EUROBERT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 52408c52577..c8ef1b5e7c2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -979,6 +979,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     type = LLM_TYPE_250M;
                 }
             } break;
+        case LLM_ARCH_EUROBERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                if (hparams.n_layer == 12) {
+                    type = LLM_TYPE_SMALL; // 0.2B
+                }
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3570,6 +3580,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_EUROBERT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                    }
+                } break;
            case LLM_ARCH_JINA_BERT_V2:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -8181,6 +8214,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
+       case LLM_ARCH_EUROBERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_GEMMA_EMBEDDING:
@@ -8378,6 +8412,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique(*this, params);
            } break;
+       case LLM_ARCH_EUROBERT:
+           {
+               llm = std::make_unique<llm_build_eurobert>(*this, params);
+           } break;
        case LLM_ARCH_BLOOM:
            {
                llm = std::make_unique<llm_build_bloom>(*this, params);
            } break;
@@ -9004,6 +9042,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
+       case LLM_ARCH_EUROBERT:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 9c118eab7e0..194eed238ec 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@
-1890,7 +1890,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "falcon-h1" || tokenizer_pre == "pixtral" || tokenizer_pre == "midm-2.0" || - tokenizer_pre == "lfm2") { + tokenizer_pre == "lfm2" || + tokenizer_pre == "jina-v5-nano") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true; diff --git a/src/models/eurobert.cpp b/src/models/eurobert.cpp new file mode 100644 index 00000000000..86e3176edc0 --- /dev/null +++ b/src/models/eurobert.cpp @@ -0,0 +1,97 @@ +#include "models.h" + +llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + ggml_tensor * inp_pos = build_inp_pos(); + + inpL = build_inp_embd(model.tok_embd); + cb(inpL, "inp_embd", -1); + + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * cur = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + + { + ggml_tensor * Qcur; + ggml_tensor * Kcur; + ggml_tensor * Vcur; + + Qcur = build_lora_mm(model.layers[il].wq, cur); + Kcur = build_lora_mm(model.layers[il].wk, cur); + Vcur = build_lora_mm(model.layers[il].wv, cur); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + cb(cur, "kqv_out", il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + cur = ggml_add(ctx0, cur, inpL); + + ggml_tensor * ffn_inp = cur; + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_embd", -1); + res->t_embd = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/models.h b/src/models/models.h index 10f8b58921e..0712d03d8d9 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -424,6 +424,10 @@ struct llm_build_neo_bert : public llm_graph_context { llm_build_neo_bert(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_eurobert : public llm_graph_context { + llm_build_eurobert(const llama_model & model, const llm_graph_params & params); +}; + template struct llm_build_olmo2 : public llm_graph_context 
{ llm_build_olmo2(const llama_model & model, const llm_graph_params & params); diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh index 7ef009dc903..7024b00afe3 100755 --- a/tests/test-tokenizer-0.sh +++ b/tests/test-tokenizer-0.sh @@ -13,7 +13,12 @@ fi name=$1 input=$2 -make -j tests/test-tokenizer-0 +# Build using CMake if binary doesn't exist +if [ ! -f ./build/bin/test-tokenizer-0 ]; then + printf "Building test-tokenizer-0 with CMake...\n" + cmake -B build -DLLAMA_BUILD_TESTS=ON + cmake --build build --target test-tokenizer-0 -j +fi printf "Testing %s on %s ...\n" $name $input @@ -23,7 +28,7 @@ printf "Tokenizing using (py) Python AutoTokenizer ...\n" python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 printf "Tokenizing using (cpp) llama.cpp ...\n" -./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 +./build/bin/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 669de55ddb9..e025c114b48 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -912,7 +912,9 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params, c const bool add_bos = llama_vocab_get_add_bos(vocab); - GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_LAST) { + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + } auto tim1 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenizing the input ..\n", __func__); From 9b62913b40b94c63db239b770e232bd48534d480 Mon Sep 17 00:00:00 2001 From: Eric Zhang <34133756+EZForever@users.noreply.github.com> Date: Thu, 26 Feb 2026 19:28:09 +0800 Subject: [PATCH 4/7] jinja : correct default size for string slices (#19913) --- common/jinja/runtime.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index c93e182a7e2..5757c76b7a1 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -721,6 +721,8 @@ value member_expression::execute_impl(context & ctx) { int64_t arr_size = 0; if (is_val(object)) { arr_size = object->as_array().size(); + } else if (is_val(object)) { + arr_size = object->as_string().length(); } if (is_stmt(this->property)) { From efba35a860954e78e72177b0d6b0c88b239d8cb1 Mon Sep 17 00:00:00 2001 From: drrros <52050875+drrros@users.noreply.github.com> Date: Thu, 26 Feb 2026 14:32:31 +0300 Subject: [PATCH 5/7] server: fix load-on-startup not respected in ini file (#19897) Co-authored-by: Roman Marchenko --- tools/server/server-models.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index efb22da5c3d..e1625477993 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -291,7 +291,9 @@ void server_models::load_models() { for (const auto & [name, inst] : mapping) { std::string val; if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) { - models_to_load.push_back(name); + if (common_arg_utils::is_truthy(val)) { + models_to_load.push_back(name); + } } } if ((int)models_to_load.size() > base_params.models_max) { From ffaafde16ffebd2853467f0dd833625a726ce08e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: 
Thu, 26 Feb 2026 13:00:57 +0100
Subject: [PATCH 6/7] ggml-virtgpu: improve the reliability of the code (#19846)

* ggml-virtgpu-backend: validate the consistency of the received objects

This patch adds consistency checks in the ggml-virtgpu-backend (running on
the host side) to ensure that the data received from the guest is consistent
(valid pointers, valid sizes and offsets).

* ggml-virtgpu-backend: add fallback/skips for optional ggml backend methods

```
1. bck->iface.synchronize(bck)
2. buft->iface.get_alloc_size(buft, op)
3. buft->iface.get_max_size(buft)
```

These three methods are optional in the GGML interface. `get_max_size` was
already properly defaulted, but `backend synchronize` and `buft get_alloc_size`
would have segfaulted the backend if not implemented.

* ggml-virtgpu-backend: fix log format missing argument

* ggml-virtgpu-backend: improve the abort message

* ggml-virtgpu-backend: more safety checks

* ggml-virtgpu-backend: new error code

* ggml-virtgpu-backend: initialize all the error codes

* ggml-virtgpu: add a missing comment generated by the code generator

* ggml-virtgpu: add the '[virtgpu]' prefix to the device/buffer names

* ggml-virtgpu: apir_device_buffer_from_ptr: improve the error message

* ggml-virtgpu: shared: make it match the latest api_remoting.h of
Virglrenderer APIR (still unmerged)

* ggml-virtgpu: update the code generator to have dispatch_command_name in a
host/guest shared file

* ggml-virtgpu: REMOTE_CALL: fail if the backend returns an error

* docs/backend/VirtGPU.md: indicate that the RAM+VRAM size is limited to
64 GB with libkrun

* ggml-virtgpu: turn off clang-format header ordering for some of the files

Compilation breaks when ordered alphabetically.

* ggml-virtgpu: clang-format

* ggml-virtgpu/backend/shared/api_remoting: better comments for the APIR
return codes
---
 docs/backend/VirtGPU.md | 4 +-
 .../backend/backend-dispatched-backend.cpp | 43 +++++++++-
 .../backend-dispatched-buffer-type.cpp | 14 ++-
 .../backend/backend-dispatched-buffer.cpp | 48 +++++++++++
 .../backend/backend-dispatched.cpp | 13 ++-
 .../backend/backend-dispatched.gen.h | 58 -------------
 .../ggml-virtgpu/backend/backend-dispatched.h | 2 +
 .../ggml-virtgpu/backend/backend-virgl-apir.h | 2 +-
 ggml/src/ggml-virtgpu/backend/backend.cpp | 38 ++++-----
 .../backend/shared/api_remoting.h | 21 +++--
 .../backend/shared/apir_backend.gen.h | 58 +++++++++++++
 .../backend/shared/apir_backend.h | 6 +-
 .../src/ggml-virtgpu/backend/shared/apir_cs.h | 20 ++---
 .../backend/shared/apir_cs_ggml.h | 27 ++++--
 .../ggml-virtgpu/backend/shared/apir_cs_rpc.h | 4 +
 .../ggml-virtgpu/ggml-backend-buffer-type.cpp | 6 +-
 ggml/src/ggml-virtgpu/ggml-backend-device.cpp | 7 +-
 ggml/src/ggml-virtgpu/ggml-backend-reg.cpp | 56 ++++++++----
 ggml/src/ggml-virtgpu/ggml-backend.cpp | 2 +-
 ggml/src/ggml-virtgpu/ggml-remoting.h | 2 +-
 ggml/src/ggml-virtgpu/include/apir_hw.h | 6 +-
 ggml/src/ggml-virtgpu/regenerate_remoting.py | 47 +++++-----
 .../ggml-virtgpu/virtgpu-forward-backend.cpp | 6 +-
 .../virtgpu-forward-buffer-type.cpp | 8 +-
 .../ggml-virtgpu/virtgpu-forward-buffer.cpp | 12 +--
 .../ggml-virtgpu/virtgpu-forward-device.cpp | 4 +-
 ggml/src/ggml-virtgpu/virtgpu-forward-impl.h | 47 +++++-----
 ggml/src/ggml-virtgpu/virtgpu-forward.gen.h | 1 +
 ggml/src/ggml-virtgpu/virtgpu.cpp | 85 ++++++++-----------
 ggml/src/ggml-virtgpu/virtgpu.h | 8 +-
 30 files changed, 398 insertions(+), 257 deletions(-)

diff --git a/docs/backend/VirtGPU.md b/docs/backend/VirtGPU.md
index c81468da13f..72f376dea7b 100644
--- a/docs/backend/VirtGPU.md
+++
b/docs/backend/VirtGPU.md @@ -152,7 +152,9 @@ Commands and data are serialized using a custom binary protocol with: - **VM-specific**: Only works in virtual machines with virtio-gpu support - **Host dependency**: Requires properly configured host-side backend - **Latency**: Small overhead from VM escaping for each operation - +- **Shared-memory size**: with the `libkrun` hypervisor, the RAM + VRAM + addressable memory is limited to 64 GB. So the maximum GPU memory + will be `64GB - RAM`, regardless of the hardware VRAM size. * This work is pending upstream changes in the VirglRenderer project. diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp index cc879e51d04..03a037f1cbd 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp @@ -7,9 +7,21 @@ #include +static uint32_t validate_graph_operation(size_t cgraph_size, uint32_t shmem_res_id, const char * operation) { + if (cgraph_size == 0) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Zero-size computation graph\n", operation); + return 1; + } + + // place-holder: validate that the size of shmem_res_id is <= cgraph_size + // need to add another method in the Virgl->APIR callback interface + GGML_UNUSED(shmem_res_id); + + return 0; // Valid +} + uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { GGML_UNUSED(ctx); - GGML_UNUSED(enc); static bool async_backend_initialized = false; static bool async_backend; @@ -34,10 +46,26 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v size_t cgraph_size; apir_decode_size_t(dec, &cgraph_size); + if (validate_graph_operation(cgraph_size, shmem_res_id, __func__) != 0) { + apir_decoder_set_fatal(dec); + return 1; + } + apir_decoder secondary_dec = apir_new_decoder((const char *) shmem_data, cgraph_size); ggml_cgraph * cgraph = apir_decode_ggml_cgraph(&secondary_dec, cgraph_size); + if (!cgraph || apir_decoder_get_fatal(&secondary_dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to deserialize computation graph\n", __func__); + return 1; + } + + if (cgraph->n_nodes < 0 || cgraph->n_leafs < 0) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid negative node/leaf count: nodes=%d leafs=%d\n", __func__, + cgraph->n_nodes, cgraph->n_leafs); + return 1; + } + ggml_status status; #if APIR_BACKEND_CHECK_SUPPORTS_OP == 1 for (int idx = 0; idx < cgraph->n_nodes; idx++) { @@ -45,7 +73,8 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v if (dev->iface.supports_op(dev, op)) { continue; } - GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op)); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", __func__, idx, + ggml_op_desc(op)); status = GGML_STATUS_ABORTED; apir_encode_ggml_status(enc, &status); @@ -53,9 +82,17 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v return 0; } #endif + + // Check if backend is properly initialized + if (!bck) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Backend not initialized (bck is null)\n", __func__); + + return 1; + } + status = bck->iface.graph_compute(bck, cgraph); - if (async_backend) { + if (async_backend && bck->iface.synchronize) { bck->iface.synchronize(bck); } diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp 
b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp index d55eec27610..c66dbaa9e8f 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp @@ -85,7 +85,19 @@ uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * d const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec); - size_t value = buft->iface.get_alloc_size(buft, op); + // Check for decode error + if (op == nullptr) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to decode tensor\n", __func__); + apir_decoder_set_fatal(dec); + return 1; + } + + size_t value; + if (buft->iface.get_alloc_size) { + value = buft->iface.get_alloc_size(buft, op); + } else { + value = ggml_nbytes(op); // Default fallback + } apir_encode_size_t(enc, &value); diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp index 8cc063ff0a6..3ade8d99b4e 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp @@ -6,11 +6,26 @@ #include +static uint32_t validate_buffer_operation(size_t offset, size_t size, const char * operation) { + // Only check for critical integer overflow - no arbitrary size limits + if (offset > SIZE_MAX - size) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Integer overflow in offset+size: %zu + %zu\n", operation, offset, size); + return 1; + } + + return 0; // Valid +} + uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { GGML_UNUSED(ctx); ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); apir_encode_uintptr_t(enc, &base); @@ -24,6 +39,11 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + ggml_tensor * tensor; // safe to remove the const qualifier here tensor = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec); @@ -37,6 +57,10 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl size_t size; apir_decode_size_t(dec, &size); + if (validate_buffer_operation(offset, size, __func__) != 0) { + return 1; + } + void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); if (!shmem_data) { @@ -56,6 +80,11 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + const ggml_tensor * tensor; // safe to remove the const qualifier here tensor = apir_decode_ggml_tensor(dec); @@ -69,6 +98,10 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl size_t size; apir_decode_size_t(dec, &size); + if (validate_buffer_operation(offset, size, __func__) != 0) { + return 1; + } + void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); if (!shmem_data) { GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem 
addr from virgl\n", __func__); @@ -86,6 +119,11 @@ uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + const ggml_tensor * src; // safe to remove the const qualifier here src = apir_decode_ggml_tensor(dec); @@ -105,6 +143,11 @@ uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + uint8_t value; apir_decode_uint8_t(dec, &value); @@ -120,6 +163,11 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + if (!apir_untrack_backend_buffer(buffer)) { GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: unknown buffer %p\n", __func__, (void *) buffer); return 1; diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp index 64152eef0d8..c80e4aabe1f 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp @@ -1,6 +1,6 @@ #include "backend-dispatched.h" -#include "backend-virgl-apir.h" +#include "backend-virgl-apir.h" #include "ggml-backend-impl.h" #include "ggml-backend.h" #include "ggml-impl.h" @@ -28,19 +28,24 @@ uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) { return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED; } - if (!reg->iface.get_device_count(reg)) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device found\n", __func__); + size_t device_count = reg->iface.get_device_count(reg); + if (!device_count) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: no device found\n", __func__); return APIR_BACKEND_INITIALIZE_NO_DEVICE; } dev = reg->iface.get_device(reg, 0); if (!dev) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device received\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: failed to get device\n", __func__); return APIR_BACKEND_INITIALIZE_NO_DEVICE; } bck = dev->iface.init_backend(dev, NULL); + if (!bck) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed\n", __func__); + return APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED; + } return APIR_BACKEND_INITIALIZE_SUCCESS; } diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h b/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h index 481d7f3150d..3dc334e4ce4 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h @@ -32,64 +32,6 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg /* backend */ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); -static inline const char * backend_dispatch_command_name(ApirBackendCommandType type) { - switch (type) { - /* device */ - case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT: - return "backend_device_get_device_count"; - case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: - return 
"backend_device_get_count"; - case APIR_COMMAND_TYPE_DEVICE_GET_NAME: - return "backend_device_get_name"; - case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: - return "backend_device_get_description"; - case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: - return "backend_device_get_type"; - case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: - return "backend_device_get_memory"; - case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: - return "backend_device_supports_op"; - case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: - return "backend_device_get_buffer_type"; - case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: - return "backend_device_get_props"; - case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: - return "backend_device_buffer_from_ptr"; - /* buffer-type */ - case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: - return "backend_buffer_type_get_name"; - case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: - return "backend_buffer_type_get_alignment"; - case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: - return "backend_buffer_type_get_max_size"; - case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: - return "backend_buffer_type_is_host (DEPRECATED)"; - case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: - return "backend_buffer_type_alloc_buffer"; - case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE: - return "backend_buffer_type_get_alloc_size"; - /* buffer */ - case APIR_COMMAND_TYPE_BUFFER_GET_BASE: - return "backend_buffer_get_base"; - case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: - return "backend_buffer_set_tensor"; - case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: - return "backend_buffer_get_tensor"; - case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR: - return "backend_buffer_cpy_tensor"; - case APIR_COMMAND_TYPE_BUFFER_CLEAR: - return "backend_buffer_clear"; - case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: - return "backend_buffer_free_buffer"; - /* backend */ - case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: - return "backend_backend_graph_compute"; - - default: - return "unknown"; - } -} - extern "C" { static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.h b/ggml/src/ggml-virtgpu/backend/backend-dispatched.h index 10311631d4f..740ee9e3ffc 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.h @@ -1,5 +1,6 @@ #pragma once +// clang-format off #include #include @@ -10,6 +11,7 @@ #include "shared/apir_backend.h" #include "shared/apir_cs.h" #include "shared/apir_cs_ggml.h" +// clang-format on #define GGML_VIRTGPU_BCK "ggml-virtgpu-backend: " diff --git a/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h b/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h index 44b347f853f..c65a01cdf9b 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +++ b/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h @@ -19,7 +19,7 @@ struct virgl_apir_callbacks { }; extern "C" { -ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs); +ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks * virgl_cbs); void apir_backend_deinit(uint32_t virgl_ctx_id); uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id, virgl_apir_callbacks * virgl_cbs, diff --git a/ggml/src/ggml-virtgpu/backend/backend.cpp b/ggml/src/ggml-virtgpu/backend/backend.cpp index d93414a078b..535a05f3e69 100644 --- a/ggml/src/ggml-virtgpu/backend/backend.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend.cpp @@ -1,6 +1,5 @@ #include 
"backend-dispatched.h" #include "backend-virgl-apir.h" - #include "shared/api_remoting.h" #include "shared/apir_backend.h" #include "shared/apir_cs.h" @@ -17,10 +16,10 @@ #define GGML_DEFAULT_BACKEND_REG "ggml_backend_init" static void * backend_library_handle = NULL; -static FILE * apir_logfile = NULL; +static FILE * apir_logfile = NULL; static void log_to_file_callback(enum ggml_log_level level, const char * text, void * user_data) { - FILE * logfile = (FILE *)user_data; + FILE * logfile = (FILE *) user_data; fprintf(logfile, "[%d] %s", level, text); fflush(logfile); } @@ -48,9 +47,9 @@ void apir_backend_deinit(uint32_t virgl_ctx_id) { } #define APIR_GGML_LIBRARY_PATH_KEY "ggml.library.path" -#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg" +#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg" -ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs) { +ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks * virgl_cbs) { const char * dlsym_error; const char * apir_log_to_file = getenv(APIR_LLAMA_CPP_LOG_TO_FILE_ENV); @@ -63,15 +62,13 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct } } - const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY); + const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY); const char * virgl_library_reg = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_REG_KEY); - const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG; + const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG; if (!library_name) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK - "%s: cannot open the GGML library: env var '%s' not defined\n", - __func__, APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV); - + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot open the GGML library: env var '%s' not defined\n", __func__, + APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV); return APIR_LOAD_LIBRARY_ENV_VAR_MISSING; } @@ -79,16 +76,14 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct backend_library_handle = dlopen(library_name, RTLD_LAZY); if (!backend_library_handle) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK - "%s: cannot open the GGML library: %s\n", __func__, dlerror()); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot open the GGML library: %s\n", __func__, dlerror()); return APIR_LOAD_LIBRARY_CANNOT_OPEN; } if (!library_reg) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK - "%s: cannot register the GGML library: env var '%s' not defined\n", - __func__, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot register the GGML library: env var '%s' not defined\n", __func__, + APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV); return APIR_LOAD_LIBRARY_ENV_VAR_MISSING; } @@ -96,11 +91,9 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg); dlsym_error = dlerror(); if (dlsym_error) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK - "%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n", + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n", __func__, library_reg, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error); - return APIR_LOAD_LIBRARY_SYMBOL_MISSING; } @@ -132,13 +125,12 @@ uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id, virgl_apir_context ctx = { 
.ctx_id = virgl_ctx_id, - .iface = virgl_cbs, + .iface = virgl_cbs, }; if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK - "%s: Received an invalid dispatch index (%d >= %d)\n", - __func__, cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Received an invalid dispatch index (%d >= %d)\n", __func__, cmd_type, + APIR_BACKEND_DISPATCH_TABLE_COUNT); return APIR_BACKEND_FORWARD_INDEX_INVALID; } diff --git a/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h b/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h index f19a5d12d17..6bf97e8a3a2 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +++ b/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h @@ -16,28 +16,32 @@ enum ApirCommandType { APIR_COMMAND_TYPE_LOADLIBRARY = 1, APIR_COMMAND_TYPE_FORWARD = 2, - APIR_COMMAND_TYPE_LENGTH = 3, + APIR_COMMAND_TYPE_LENGTH = 3, }; typedef uint64_t ApirCommandFlags; enum ApirLoadLibraryReturnCode { APIR_LOAD_LIBRARY_SUCCESS = 0, + // these error codes are returned by the Virglrenderer APIR component APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1, APIR_LOAD_LIBRARY_ALREADY_LOADED = 2, APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3, APIR_LOAD_LIBRARY_CANNOT_OPEN = 4, APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5, - APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is a APIR backend library initialization return code + // any value greater than this is an APIR *backend library* initialization return code + APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, }; enum ApirForwardReturnCode { - APIR_FORWARD_SUCCESS = 0, - APIR_FORWARD_NO_DISPATCH_FCT = 1, - APIR_FORWARD_TIMEOUT = 2, - - APIR_FORWARD_BASE_INDEX = 3, // anything above this is a APIR backend library forward return code -} ; + APIR_FORWARD_SUCCESS = 0, + // these error codes are returned by the Virglrenderer APIR component + APIR_FORWARD_NO_DISPATCH_FCT = 1, + APIR_FORWARD_TIMEOUT = 2, + APIR_FORWARD_FAILED_TO_SYNC_STREAMS = 3, + // any value greater than this index an APIR *backend library* forward return code + APIR_FORWARD_BASE_INDEX = 4, +}; __attribute__((unused)) static inline const char * apir_command_name(ApirCommandType type) { switch (type) { @@ -82,6 +86,7 @@ __attribute__((unused)) static const char * apir_forward_error(ApirForwardReturn APIR_FORWARD_ERROR(APIR_FORWARD_SUCCESS); APIR_FORWARD_ERROR(APIR_FORWARD_NO_DISPATCH_FCT); APIR_FORWARD_ERROR(APIR_FORWARD_TIMEOUT); + APIR_FORWARD_ERROR(APIR_FORWARD_FAILED_TO_SYNC_STREAMS); APIR_FORWARD_ERROR(APIR_FORWARD_BASE_INDEX); return "Unknown APIR_COMMAND_TYPE_FORWARD error"; diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h index d214b6f2a90..520ac9c7299 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h @@ -34,3 +34,61 @@ typedef enum ApirBackendCommandType { // last command_type index + 1 APIR_BACKEND_DISPATCH_TABLE_COUNT = 23, } ApirBackendCommandType; + +static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) { + switch (type) { + /* device */ + case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT: + return "device_get_device_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: + return "device_get_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_NAME: + return "device_get_name"; + case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: + return "device_get_description"; + case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: + return "device_get_type"; + case 
APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: + return "device_get_memory"; + case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: + return "device_supports_op"; + case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: + return "device_get_buffer_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: + return "device_get_props"; + case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: + return "device_buffer_from_ptr"; + /* buffer-type */ + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: + return "buffer_type_get_name"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: + return "buffer_type_get_alignment"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: + return "buffer_type_get_max_size"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: + return "buffer_type_is_host"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: + return "buffer_type_alloc_buffer"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE: + return "buffer_type_get_alloc_size"; + /* buffer */ + case APIR_COMMAND_TYPE_BUFFER_GET_BASE: + return "buffer_get_base"; + case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: + return "buffer_set_tensor"; + case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: + return "buffer_get_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR: + return "buffer_cpy_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CLEAR: + return "buffer_clear"; + case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: + return "buffer_free_buffer"; + /* backend */ + case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: + return "backend_graph_compute"; + + default: + return "unknown"; + } +} diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h index f3efa52c721..da1e21b5b2f 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h @@ -14,7 +14,7 @@ #define APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED 6 #define APIR_BACKEND_INITIALIZE_ALREADY_INITED 7 #define APIR_BACKEND_INITIALIZE_NO_DEVICE 8 - +#define APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED 9 // new entries here need to be added to the apir_backend_initialize_error function below @@ -39,6 +39,10 @@ static const char * apir_backend_initialize_error(int code) { APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS); APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS); APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_ALREADY_INITED); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_NO_DEVICE); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED); return "Unknown APIR_BACKEND_INITIALIZE error:/"; diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h index 1bc3a5f685b..64bf2ec9609 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h @@ -13,7 +13,6 @@ struct apir_encoder { const char * start; const char * end; bool fatal; - }; struct apir_decoder { @@ -28,8 +27,8 @@ struct apir_decoder { static apir_decoder apir_new_decoder(const char * ptr, size_t size) { apir_decoder dec = { - .cur = ptr, - .end = ptr + size, + .cur = ptr, + .end = ptr + size, .fatal = false, }; @@ -79,10 +78,7 @@ static inline bool apir_decoder_get_fatal(const apir_decoder * dec) { * encode peek */ -static inline bool apir_decoder_peek_internal(apir_decoder * dec, - size_t size, - void * val, - size_t val_size) { 
+static inline bool apir_decoder_peek_internal(apir_decoder * dec, size_t size, void * val, size_t val_size) { assert(val_size <= size); if (unlikely(size > (size_t) (dec->end - dec->cur))) { @@ -332,8 +328,7 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t static inline void * apir_decoder_alloc_array(size_t size, size_t count) { size_t alloc_size; if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) { - GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n", - __func__, size, count); + GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n", __func__, size, count); return NULL; } @@ -352,20 +347,19 @@ static inline void apir_decode_bool_t(apir_decoder * dec, bool * val) { /* apir_buffer_type_host_handle_t */ -static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc, +static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc, const apir_buffer_type_host_handle_t * val) { apir_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); } -static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec, +static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec, apir_buffer_type_host_handle_t * val) { apir_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); } /* apir_buffer_host_handle_t */ -static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc, - const apir_buffer_host_handle_t * val) { +static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc, const apir_buffer_host_handle_t * val) { apir_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t)); } diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h index 289f4b77d74..fabe3e401ca 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h @@ -1,11 +1,10 @@ -#include "ggml-impl.h" #include "apir_cs.h" #include "apir_cs_rpc.h" +#include "ggml-impl.h" // ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer); -static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc, - const apir_buffer_host_handle_t * handle); +static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle); static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec); @@ -22,8 +21,7 @@ static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size); } -static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec, - uint32_t n_tensors) { +static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec, uint32_t n_tensors) { size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors; return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size); @@ -45,9 +43,9 @@ static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) { } ggml_init_params params{ - /*.mem_size =*/ ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, + /*.mem_size =*/ggml_tensor_overhead(), + /*.mem_buffer =*/NULL, + /*.no_alloc =*/true, }; ggml_context * ctx = ggml_init(params); @@ -105,6 +103,19 @@ static inline 
ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec) apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size); + // SECURITY: Validate buffer handle against tracked buffers to prevent + // guest VM from providing arbitrary host memory addresses + if (buffer) { + extern std::unordered_set backend_buffers; + if (backend_buffers.find(buffer) == backend_buffers.end()) { + GGML_LOG_WARN("ggml-virtgpu-backend: %s: Invalid buffer handle from guest: %p\n", __func__, + (void *) buffer); + // Set fatal flag to prevent further processing with invalid handle + apir_decoder_set_fatal(dec); + return NULL; + } + } + return buffer; } diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h index f6817989528..4cb2f047d1e 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h @@ -1,3 +1,6 @@ +#pragma once + +// clang-format off #include "ggml.h" #include "ggml-backend-impl.h" @@ -5,6 +8,7 @@ #include #include #include +// clang-format on // ggml_tensor is serialized into apir_rpc_tensor struct apir_rpc_tensor { diff --git a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp index c493a8e2ae3..8fa20ff43bd 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp @@ -34,6 +34,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { virtgpu * gpu = BUFT_TO_GPU(buft); + // Return the prefixed name that was built once during initialization return gpu->cached_buffer_type.name; } @@ -53,9 +54,8 @@ static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buff const ggml_tensor * tensor) { virtgpu * gpu = BUFT_TO_GPU(buft); - if (tensor->buffer == NULL - || !tensor->buffer->context - || !buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) { + if (tensor->buffer == NULL || !tensor->buffer->context || + !buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) { return ggml_nbytes(tensor); } diff --git a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp index c7d2881058b..ec8156bb868 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp @@ -3,6 +3,7 @@ static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); + // Return the prefixed name that was built once during initialization return gpu->cached_device_info.name; } @@ -22,7 +23,7 @@ static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_bac static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { virtgpu * gpu = DEV_TO_GPU(dev); - *free = gpu->cached_device_info.memory_free; + *free = gpu->cached_device_info.memory_free; *total = gpu->cached_device_info.memory_total; } @@ -72,7 +73,7 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); - static std::atomic initialized = false; + static std::atomic initialized = false; static ggml_backend_buffer_type buft; if (!initialized) { @@ -95,7 +96,7 @@ ggml_backend_buffer_type_t 
ggml_backend_remoting_device_get_buffer_type(ggml_bac static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); - static std::atomic initialized = false; + static std::atomic initialized = false; static ggml_backend_buffer_type buft; if (!initialized) { diff --git a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp index 2d02cfec1d3..a4df5956aa3 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp @@ -7,8 +7,8 @@ void ggml_virtgpu_cleanup(virtgpu * gpu); static virtgpu * apir_initialize() { - static virtgpu * gpu = NULL; - static std::atomic initialized = false; + static virtgpu * gpu = NULL; + static std::atomic initialized = false; if (initialized) { // fast track @@ -31,29 +31,53 @@ static virtgpu * apir_initialize() { } // Pre-fetch and cache all device information, it will not change - gpu->cached_device_info.description = apir_device_get_description(gpu); + gpu->cached_device_info.description = apir_device_get_description(gpu); if (!gpu->cached_device_info.description) { GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device description", __func__); } - gpu->cached_device_info.name = apir_device_get_name(gpu); - if (!gpu->cached_device_info.name) { - GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device name", __func__); - } gpu->cached_device_info.device_count = apir_device_get_count(gpu); gpu->cached_device_info.type = apir_device_get_type(gpu); - apir_device_get_memory(gpu, - &gpu->cached_device_info.memory_free, - &gpu->cached_device_info.memory_total); + { + // Get the remote name and create prefixed version + char * rmt_device_name = apir_device_get_name(gpu); + if (!rmt_device_name) { + GGML_ABORT(GGML_VIRTGPU "%s: failed to get the virtgpu device name", __func__); + } + + size_t device_name_len = strlen(rmt_device_name) + 11; // "[virtgpu] " + null terminator + gpu->cached_device_info.name = (char *) malloc(device_name_len); + if (!gpu->cached_device_info.name) { + free(rmt_device_name); + GGML_ABORT(GGML_VIRTGPU "%s: failed to allocate memory for prefixed device name", __func__); + } + snprintf(gpu->cached_device_info.name, device_name_len, "[virtgpu] %s", rmt_device_name); + free(rmt_device_name); + } + + apir_device_get_memory(gpu, &gpu->cached_device_info.memory_free, &gpu->cached_device_info.memory_total); apir_buffer_type_host_handle_t buft_host_handle = apir_device_get_buffer_type(gpu); gpu->cached_buffer_type.host_handle = buft_host_handle; - gpu->cached_buffer_type.name = apir_buffer_type_get_name(gpu, buft_host_handle); - if (!gpu->cached_buffer_type.name) { - GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu buffer type name", __func__); + { + // Get the remote name and create prefixed version + char * rmt_name = apir_buffer_type_get_name(gpu, buft_host_handle); + if (!rmt_name) { + GGML_ABORT(GGML_VIRTGPU "%s: failed to get the virtgpu buffer type name", __func__); + } + + size_t prefixed_len = strlen(rmt_name) + 11; // "[virtgpu] " + null terminator + gpu->cached_buffer_type.name = (char *) malloc(prefixed_len); + if (!gpu->cached_buffer_type.name) { + free(rmt_name); + GGML_ABORT(GGML_VIRTGPU "%s: failed to allocate memory for prefixed buffer type name", __func__); + } + snprintf(gpu->cached_buffer_type.name, prefixed_len, "[virtgpu] %s", rmt_name); + free(rmt_name); } - gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, 
- gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle); - gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle); + + gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle); + gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle); initialized = true; } @@ -98,7 +122,7 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) { static std::atomic<bool> initialized = false; if (initialized) { - return; // fast track + return; // fast track } { diff --git a/ggml/src/ggml-virtgpu/ggml-backend.cpp b/ggml/src/ggml-virtgpu/ggml-backend.cpp index 5cd6c0c0608..a63ee2b9d2f 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend.cpp @@ -1,5 +1,5 @@ -#include "ggml-remoting.h" #include "../../include/ggml-virtgpu.h" +#include "ggml-remoting.h" static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { UNUSED(backend); diff --git a/ggml/src/ggml-virtgpu/ggml-remoting.h b/ggml/src/ggml-virtgpu/ggml-remoting.h index 08766408676..4f70326bee2 100644 --- a/ggml/src/ggml-virtgpu/ggml-remoting.h +++ b/ggml/src/ggml-virtgpu/ggml-remoting.h @@ -9,7 +9,7 @@ #include #define GGML_VIRTGPU_NAME "ggml-virtgpu" -#define GGML_VIRTGPU "ggml-virtgpu: " +#define GGML_VIRTGPU "ggml-virtgpu: " // USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoid micro-benchmark crashes diff --git a/ggml/src/ggml-virtgpu/include/apir_hw.h b/ggml/src/ggml-virtgpu/include/apir_hw.h index 33af045ca2b..7d6ea2265db 100644 --- a/ggml/src/ggml-virtgpu/include/apir_hw.h +++ b/ggml/src/ggml-virtgpu/include/apir_hw.h @@ -3,7 +3,7 @@ #include struct virgl_renderer_capset_apir { - uint32_t apir_version; - uint32_t supports_blob_resources; - uint32_t reserved[4]; // For future expansion + uint32_t apir_version; + uint32_t supports_blob_resources; + uint32_t reserved[4]; // For future expansion }; diff --git a/ggml/src/ggml-virtgpu/regenerate_remoting.py b/ggml/src/ggml-virtgpu/regenerate_remoting.py index aeb48a4087e..dae75fd1c80 100755 --- a/ggml/src/ggml-virtgpu/regenerate_remoting.py +++ b/ggml/src/ggml-virtgpu/regenerate_remoting.py @@ -145,8 +145,31 @@ def generate_apir_backend_header(self) -> str: enum_lines.append(f"    APIR_BACKEND_DISPATCH_TABLE_COUNT = {total_count},") enum_lines.append("} ApirBackendCommandType;") + # Generate function name mapping + func_lines = [] + func_lines.append("static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) {") + func_lines.append("    switch (type) {") + + current_group = None + for func in functions: + # Add comment for new group + if func['group_name'] != current_group: + func_lines.append(f"        /* {func['group_description']} */") + current_group = func['group_name'] + + # Generate clean function name without backend_ prefix + clean_name = f"{func['group_name']}_{func['function_name']}" + func_lines.append(f"        case {func['enum_name']}:") + func_lines.append(f"            return \"{clean_name}\";") + + func_lines.append("") + func_lines.append("        default:") + func_lines.append("            return \"unknown\";") + func_lines.append("    }") + func_lines.append("}") + # Full header template - header_content = NL.join(enum_lines) + "\n" + header_content = NL.join(enum_lines) + "\n\n" + NL.join(func_lines) + "\n" return header_content
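For reference, the apir_dispatch_command_name generated by the change above comes out roughly like this; the two enumerators shown are illustrative placeholders, since the real list is derived from the dispatch-table definition:

    #include <cstdio>

    // Illustrative subset; the generator emits one case per dispatch-table entry.
    typedef enum {
        /* device */
        APIR_COMMAND_TYPE_DEVICE_GET_NAME   = 0,
        /* buffer */
        APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 1,
        APIR_BACKEND_DISPATCH_TABLE_COUNT   = 2,
    } ApirBackendCommandType;

    static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) {
        switch (type) {
            /* device */
            case APIR_COMMAND_TYPE_DEVICE_GET_NAME:
                return "device_get_name";
            /* buffer */
            case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR:
                return "buffer_set_tensor";

            default:
                return "unknown";
        }
    }

    int main() {
        std::printf("%s\n", apir_dispatch_command_name(APIR_COMMAND_TYPE_BUFFER_SET_TENSOR));
    }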
@@ -170,19 +193,6 @@ def generate_backend_dispatched_header(self) -> str: decl_lines.append(f"{signature} {func['backend_function']}({params});") - # Switch cases - switch_lines = [] - current_group = None - - for func in functions: - if func['group_name'] != current_group: - switch_lines.append(f"    /* {func['group_description']} */") - current_group = func['group_name'] - - deprecated = " (DEPRECATED)" if func['deprecated'] else "" - - switch_lines.append(f"    case {func['enum_name']}: return \"{func['backend_function']}{deprecated}\";") - # Dispatch table table_lines = [] current_group = None @@ -201,15 +211,6 @@ def generate_backend_dispatched_header(self) -> str: {NL.join(decl_lines)} -static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) -{{ - switch (type) {{ -{NL.join(switch_lines)} - - default: return "unknown"; -}} - extern "C" {{ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {{ {NL.join(table_lines)} diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp index 07d9a668496..4593690c638 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp @@ -17,8 +17,8 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) { size_t cgraph_size = apir_serialize_ggml_cgraph(cgraph, cgraph_data); virtgpu_shmem temp_shmem; // Local storage for large buffers - virtgpu_shmem * shmem = &temp_shmem; - bool using_shared_shmem = false; + virtgpu_shmem * shmem = &temp_shmem; + bool using_shared_shmem = false; if (cgraph_size <= gpu->data_shmem.mmap_size) { // Lock mutex before using shared data_shmem buffer @@ -26,7 +26,7 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) { GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__); } using_shared_shmem = true; - shmem = &gpu->data_shmem; + shmem = &gpu->data_shmem; } else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) { GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__); } diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp index cab74fd1707..38f8ec945e0 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp @@ -62,7 +62,9 @@ size_t apir_buffer_type_get_max_size(virtgpu * gpu, apir_buffer_type_host_handle return max_size; } -apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size) { +apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, + apir_buffer_type_host_handle_t host_handle, + size_t size) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; @@ -84,7 +86,9 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_t return buffer_context; } -size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, const ggml_tensor * op) { +size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, + apir_buffer_type_host_handle_t host_handle, + const ggml_tensor * op) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret;
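apir_backend_graph_compute above and apir_buffer_set_tensor/apir_buffer_get_tensor below share the same transfer pattern: reuse the long-lived data_shmem region under its mutex when the payload fits, otherwise allocate a one-off region. A simplified sketch of that control flow, with std::mutex and a byte vector standing in for mtx_t and virtgpu_shmem (all names illustrative):

    #include <cstddef>
    #include <mutex>
    #include <vector>

    struct shmem_region {
        std::vector<unsigned char> pages;
    };

    struct gpu_state {
        shmem_region data_shmem;        // long-lived region shared by all calls
        std::mutex   data_shmem_mutex;  // serializes users of the shared region
    };

    static void transfer(gpu_state & gpu, size_t size) {
        shmem_region   temp;            // local storage for oversized payloads
        shmem_region * use    = &temp;
        bool           shared = false;

        if (size <= gpu.data_shmem.pages.size()) {
            gpu.data_shmem_mutex.lock();    // held until the host reply arrives
            shared = true;
            use    = &gpu.data_shmem;
        } else {
            temp.pages.resize(size);        // one-off allocation for this call
        }

        // encode the payload into use->pages and perform the remote call ...
        use->pages.at(0) = 0xff;            // stand-in for the actual encode + call

        if (shared) {
            gpu.data_shmem_mutex.unlock();
        }
        // otherwise temp is released automatically when it goes out of scope
    }

    int main() {
        gpu_state gpu;
        gpu.data_shmem.pages.resize(4096);
        transfer(gpu, 1024);    // fits: reuses data_shmem
        transfer(gpu, 65536);   // too big: allocates a temporary region
    }

Holding the mutex across the remote call is what makes the shared region safe to reuse; the oversized path pays an allocation instead of waiting on the lock.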
diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp index 86eee358cf4..228284f4a42 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp @@ -35,8 +35,8 @@ void apir_buffer_set_tensor(virtgpu * gpu, apir_encode_ggml_tensor(encoder, tensor); virtgpu_shmem temp_shmem; // Local storage for large buffers - virtgpu_shmem * shmem = &temp_shmem; - bool using_shared_shmem = false; + virtgpu_shmem * shmem = &temp_shmem; + bool using_shared_shmem = false; if (size <= gpu->data_shmem.mmap_size) { // Lock mutex before using shared data_shmem buffer @@ -44,7 +44,7 @@ void apir_buffer_set_tensor(virtgpu * gpu, GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__); } using_shared_shmem = true; - shmem = &gpu->data_shmem; + shmem = &gpu->data_shmem; } else if (virtgpu_shmem_create(gpu, size, shmem)) { GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__); @@ -86,8 +86,8 @@ void apir_buffer_get_tensor(virtgpu * gpu, apir_encode_ggml_tensor(encoder, tensor); virtgpu_shmem temp_shmem; // Local storage for large buffers - virtgpu_shmem * shmem = &temp_shmem; - bool using_shared_shmem = false; + virtgpu_shmem * shmem = &temp_shmem; + bool using_shared_shmem = false; if (size <= gpu->data_shmem.mmap_size) { // Lock mutex before using shared data_shmem buffer @@ -95,7 +95,7 @@ void apir_buffer_get_tensor(virtgpu * gpu, GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__); } using_shared_shmem = true; - shmem = &gpu->data_shmem; + shmem = &gpu->data_shmem; } else if (virtgpu_shmem_create(gpu, size, shmem)) { GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__); diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp index 4b6b8f527be..9f513c138dd 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp @@ -26,7 +26,7 @@ char * apir_device_get_name(virtgpu * gpu) { REMOTE_CALL(gpu, encoder, decoder, ret); const size_t string_size = apir_decode_array_size_unchecked(decoder); - char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); + char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); if (!string) { GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__); return NULL; } @@ -173,7 +173,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, si REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR); if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) { - GGML_ABORT(GGML_VIRTGPU "Couldn't allocate the guest-host shared buffer"); + GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate %zu bytes of guest-host shared buffer", __func__, size); } apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id);
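The REMOTE_CALL_PREPARE rewrite in the next hunk deliberately declares its locals before the do { ... } while (0) block so that they stay in scope for the matching REMOTE_CALL expansion, which now reports the resolved command name when the forwarded call fails. A stripped-down illustration of that macro pairing; CALL_PREPARE and CALL_CHECK are illustrative stand-ins:

    #include <cstdio>

    // Not wrapped in do/while on purpose: the local must outlive this macro so
    // that CALL_CHECK, expanded later in the same scope, can read it.
    #define CALL_PREPARE(cmd) \
        const char * call_command_name = #cmd;

    #define CALL_CHECK(rc)                                                       \
        do {                                                                     \
            if ((rc) != 0) {                                                     \
                std::fprintf(stderr,                                             \
                             "backend function '%s' failed (return code: %d)\n", \
                             call_command_name, (rc));                           \
            }                                                                    \
        } while (0)

    int main() {
        CALL_PREPARE(buffer_get_tensor);
        int rc = 3;      // pretend the forwarded call failed on the host side
        CALL_CHECK(rc);  // prints: backend function 'buffer_get_tensor' failed ...
    }

The trade-off, which applies to the real macros as well, is that such a prepare macro can only be expanded once per scope, since a second expansion would redeclare the same local.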
diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h b/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h index f23c75bb968..4d0b6e05c74 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h @@ -1,29 +1,36 @@ -#include "virtgpu.h" +#pragma once +// clang-format off +#include "virtgpu.h" #include "ggml-remoting.h" #include "backend/shared/apir_backend.h" #include "backend/shared/apir_cs_ggml.h" - #include "ggml-backend-impl.h" +// clang-format on -#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \ - do { \ - int32_t forward_flag = (int32_t) apir_command_type__; \ - encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \ - if (!encoder_name) { \ - GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \ - } \ +#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \ + int32_t REMOTE_CALL_PREPARE_forward_flag = (int32_t) apir_command_type__; \ + const char * REMOTE_CALL_PREPARE_command_name = apir_dispatch_command_name(apir_command_type__); \ + do { \ + encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, REMOTE_CALL_PREPARE_forward_flag); \ + if (!encoder_name) { \ + GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \ + } \ } while (0) -#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \ - do { \ - ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \ - if (!decoder_name) { \ - GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \ - } \ - if (ret_name < APIR_FORWARD_BASE_INDEX) { \ - GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \ - apir_forward_error(ret_name), ret_name); \ - } \ - ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \ +#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \ + do { \ + ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \ + if (!decoder_name) { \ + GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \ + } \ + if (ret_name < APIR_FORWARD_BASE_INDEX) { \ + GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \ + apir_forward_error(ret_name), ret_name); \ + } \ + ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \ + if (ret_name != 0) { \ + GGML_ABORT(GGML_VIRTGPU "backend function '%s' failed (return code: %d)", \ + REMOTE_CALL_PREPARE_command_name, ret_name); \ + } \ } while (0) diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h index fe4cae20253..44b0ad1ffa1 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +++ b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h @@ -20,6 +20,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu, char * apir_buffer_type_get_name(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle); size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle); size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle); +/* apir_buffer_type_is_host is deprecated. */ apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size); diff --git a/ggml/src/ggml-virtgpu/virtgpu.cpp b/ggml/src/ggml-virtgpu/virtgpu.cpp index 1e650dc65b2..a84a77399d9 100644 --- a/ggml/src/ggml-virtgpu/virtgpu.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu.cpp @@ -53,9 +53,9 @@ static int virtgpu_handshake(virtgpu * gpu) { if (!decoder) { GGML_ABORT(GGML_VIRTGPU - "%s: failed to initiate the communication with the virglrenderer library. 
" + "Most likely, the wrong virglrenderer library was loaded in the hypervisor.", + __func__); return 1; } @@ -65,8 +65,7 @@ static int virtgpu_handshake(virtgpu * gpu) { uint32_t host_minor; if (ret_magic != APIR_HANDSHAKE_MAGIC) { - GGML_ABORT(GGML_VIRTGPU - "%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic, + GGML_ABORT(GGML_VIRTGPU "%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic, apir_backend_initialize_error(ret_magic)); } else { apir_decode_uint32_t(decoder, &host_major); @@ -140,15 +139,13 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) { "Make sure virglrenderer is correctly configured by the hypervisor. (%s) ", __func__, apir_load_library_error(ret)); } else { - GGML_ABORT(GGML_VIRTGPU - "%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)", __func__, - apir_load_library_error(ret), ret); + GGML_ABORT(GGML_VIRTGPU "%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)", + __func__, apir_load_library_error(ret), ret); } return ret; } - GGML_LOG_INFO(GGML_VIRTGPU - "%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__); + GGML_LOG_INFO(GGML_VIRTGPU "%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__); ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX); @@ -158,10 +155,11 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) { "Make sure virglrenderer is correctly configured by the hypervisor. (%s)", __func__, apir_load_library_error(apir_ret)); } else if (apir_ret == APIR_LOAD_LIBRARY_SYMBOL_MISSING) { - GGML_ABORT(GGML_VIRTGPU - "%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. " - "Make sure virglrenderer is correctly configured by the hypervisor. (%s)", - __func__, apir_load_library_error(apir_ret)); + GGML_ABORT( + GGML_VIRTGPU + "%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. " + "Make sure virglrenderer is correctly configured by the hypervisor. 
(%s)", + __func__, apir_load_library_error(apir_ret)); } else if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) { GGML_ABORT(GGML_VIRTGPU "%s: the API Remoting backend library couldn't load the GGML backend library: apir code=%d | %s)", @@ -169,8 +167,8 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) { } else { uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX; GGML_ABORT(GGML_VIRTGPU - "%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__, - lib_ret); + "%s: the API Remoting backend library failed to initialize its backend library: apir code=%d)", + __func__, lib_ret); } return ret; } @@ -184,55 +182,49 @@ virtgpu * create_virtgpu() { // Initialize mutex to protect shared data_shmem buffer if (mtx_init(&gpu->data_shmem_mutex, mtx_plain) != thrd_success) { delete gpu; - GGML_ABORT(GGML_VIRTGPU - "%s: failed to initialize data_shmem mutex", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize data_shmem mutex", __func__); return NULL; } if (virtgpu_open(gpu) != APIR_SUCCESS) { - GGML_LOG_ERROR(GGML_VIRTGPU - "%s: failed to open the virtgpu device\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to open the virtgpu device\n", __func__); return NULL; } if (virtgpu_init_capset(gpu) != APIR_SUCCESS) { if (gpu->use_apir_capset) { GGML_ABORT(GGML_VIRTGPU - "%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library supports it.", __func__); + "%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library " + "supports it.", + __func__); } else { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to initialize the virtgpu Venus capset", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu Venus capset", __func__); } return NULL; } if (virtgpu_init_context(gpu) != APIR_SUCCESS) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to initialize the GPU context", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the GPU context", __func__); return NULL; } if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to create the shared reply memory pages", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to create the shared reply memory pages", __func__); return NULL; } if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to create the shared data memory pages", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to create the shared data memory pages", __func__); return NULL; } if (virtgpu_handshake(gpu)) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to handshake with the virglrenderer library", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to handshake with the virglrenderer library", __func__); return NULL; } if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to load the backend library", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to load the backend library", __func__); return NULL; } @@ -243,8 +235,7 @@ static virt_gpu_result_t virtgpu_open(virtgpu * gpu) { drmDevicePtr devs[8]; int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); if (count < 0) { - GGML_LOG_ERROR(GGML_VIRTGPU - "%s: failed to enumerate DRM devices\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to enumerate DRM devices\n", __func__); return APIR_ERROR_INITIALIZATION_FAILED; } @@ -266,19 +257,17 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d int fd = 
open(node_path, O_RDWR | O_CLOEXEC); if (fd < 0) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to open %s", __func__, node_path); + GGML_ABORT(GGML_VIRTGPU "%s: failed to open %s", __func__, node_path); return APIR_ERROR_INITIALIZATION_FAILED; } drmVersionPtr version = drmGetVersion(fd); if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) { if (version) { - GGML_LOG_ERROR(GGML_VIRTGPU - "%s: unknown DRM driver %s version %d\n", __func__, version->name, version->version_major); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: unknown DRM driver %s version %d\n", __func__, version->name, + version->version_major); } else { - GGML_LOG_ERROR(GGML_VIRTGPU - "%s: failed to get DRM driver version\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to get DRM driver version\n", __func__); } if (version) { @@ -322,9 +311,8 @@ static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) { virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data)); if (ret) { - GGML_LOG_ERROR(GGML_VIRTGPU - "%s: failed to get APIR v%d capset: %s\n", - __func__, gpu->capset.version, strerror(errno)); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to get APIR v%d capset: %s\n", __func__, gpu->capset.version, + strerror(errno)); return APIR_ERROR_INITIALIZATION_FAILED; } @@ -547,13 +535,10 @@ static void log_call_duration(long long call_duration_ns, const char * name) { double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds if (call_duration_s > 1) { - GGML_LOG_INFO(GGML_VIRTGPU - "waited %.2fs for the %s host reply...\n", call_duration_s, name); + GGML_LOG_INFO(GGML_VIRTGPU "waited %.2fs for the %s host reply...\n", call_duration_s, name); } else if (call_duration_ms > 1) { - GGML_LOG_INFO(GGML_VIRTGPU - "waited %.2fms for the %s host reply...\n", call_duration_ms, name); + GGML_LOG_INFO(GGML_VIRTGPU "waited %.2fms for the %s host reply...\n", call_duration_ms, name); } else { - GGML_LOG_INFO(GGML_VIRTGPU - "waited %lldns for the %s host reply...\n", call_duration_ns, name); + GGML_LOG_INFO(GGML_VIRTGPU "waited %lldns for the %s host reply...\n", call_duration_ns, name); } } diff --git a/ggml/src/ggml-virtgpu/virtgpu.h b/ggml/src/ggml-virtgpu/virtgpu.h index 68e0f3a376e..f82d8fb50ba 100644 --- a/ggml/src/ggml-virtgpu/virtgpu.h +++ b/ggml/src/ggml-virtgpu/virtgpu.h @@ -1,5 +1,6 @@ #pragma once +// clang-format off #include "virtgpu-utils.h" #include "virtgpu-shm.h" #include "virtgpu-apir.h" @@ -23,20 +24,21 @@ #include "apir_hw.h" #include #include "venus_hw.h" +// clang-format on #ifndef VIRTGPU_DRM_CAPSET_APIR // Will be defined include/drm/virtgpu_drm.h when // https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590/diffs // is merged -#define VIRTGPU_DRM_CAPSET_APIR 10 +# define VIRTGPU_DRM_CAPSET_APIR 10 #endif // Mesa/Virlgrenderer Venus internal. 
Only necessary during the // Venus->APIR transition in Virglrenderer #define VENUS_COMMAND_TYPE_LENGTH 331 -#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16 -#define VIRTGPU_DRM_CAPSET_VENUS 4 +#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16 +# define VIRTGPU_DRM_CAPSET_VENUS 4 #endif typedef uint32_t virgl_renderer_capset; From b68d75165ad37ba1256cc45a43ec4f51cf813c3e Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 26 Feb 2026 21:01:08 +0800 Subject: [PATCH 7/7] llama: Add option to merge gate and exp weights (#19139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * llama: Add option to merge gate and exp weights * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * update constants.py * add gate_up for the all MoE models * convert: simplify merge tensor condition * update constants.py * reduce number of models, add create_tensor_gate_up helper --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 41 ++++++++++++++++++-- gguf-py/gguf/constants.py | 5 +++ gguf-py/gguf/tensor_mapping.py | 4 ++ src/llama-arch.cpp | 5 +++ src/llama-arch.h | 1 + src/llama-graph.cpp | 68 ++++++++++++++++++++++++---------- src/llama-graph.h | 7 +++- src/llama-model.cpp | 18 ++++++--- src/llama-model.h | 18 +++++---- src/models/deepseek2.cpp | 4 +- src/models/qwen35moe.cpp | 3 +- src/models/qwen3next.cpp | 3 +- 12 files changed, 134 insertions(+), 43 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5709cb0e3c1..09544173981 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -116,7 +116,8 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, disable_mistral_community_chat_template: bool = False, - sentence_transformers_dense_modules: bool = False): + sentence_transformers_dense_modules: bool = False, + fuse_gate_up_exps: bool = False): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: @@ -135,6 +136,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.dry_run = dry_run self.remote_hf_model_id = remote_hf_model_id self.sentence_transformers_dense_modules = sentence_transformers_dense_modules + self.fuse_gate_up_exps = fuse_gate_up_exps + self._gate_exp_buffer: dict[int, Tensor] = {} + self._up_exp_buffer: dict[int, Tensor] = {} self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) self.metadata_override = metadata_override @@ -512,8 +516,31 @@ def set_gguf_parameters(self): raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - return [(self.map_tensor_name(name), data_torch)] + new_name = self.map_tensor_name(name) + + # Handle gate/up expert tensor fusion if enabled + if self.fuse_gate_up_exps and bid is not None: + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid): + self._gate_exp_buffer[bid] = data_torch + elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): 
+ self._up_exp_buffer[bid] = data_torch + + # Check if both gate and up are buffered for this layer + if bid in self._gate_exp_buffer and bid in self._up_exp_buffer: + gate_data = self._gate_exp_buffer.pop(bid) + up_data = self._up_exp_buffer.pop(bid) + # gate/up shape: (n_expert, n_ff, n_embd), concatenate to (n_expert, n_ff*2, n_embd) + fused_data = torch.cat([gate_data, up_data], dim=1) + fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, bid) + logger.info(f"Fused gate_exps and up_exps for layer {bid}") + return [(fused_name, fused_data)] + + # If we buffered a gate/up tensor, wait for the other + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid) or \ + self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): + return [] + + return [(new_name, data_torch)] def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid, n_dims # unused @@ -11942,6 +11969,11 @@ def parse_args() -> argparse.Namespace: "Default these modules are not included.") ) + parser.add_argument( + "--fuse-gate-up-exps", action="store_true", + help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.", + ) + args = parser.parse_args() if not args.print_supported_models and args.model is None: parser.error("the following arguments are required: model") @@ -12079,7 +12111,8 @@ def main() -> None: split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, - sentence_transformers_dense_modules=args.sentence_transformers_dense_modules + sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, + fuse_gate_up_exps=args.fuse_gate_up_exps ) if args.vocab_only: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e9ef97d553c..839c6e787fc 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -532,6 +532,7 @@ class MODEL_TENSOR(IntEnum): FFN_GATE_EXP = auto() FFN_DOWN_EXP = auto() FFN_UP_EXP = auto() + FFN_GATE_UP_EXP = auto() FFN_GATE_SHEXP = auto() FFN_DOWN_SHEXP = auto() FFN_UP_SHEXP = auto() @@ -980,6 +981,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.FFN_GATE_UP_EXP: "blk.{bid}.ffn_gate_up_exps", MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n @@ -1820,6 +1822,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, MODEL_TENSOR.SSM_A, MODEL_TENSOR.SSM_CONV1D, MODEL_TENSOR.SSM_DT, @@ -1909,6 +1912,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, MODEL_TENSOR.SSM_A, MODEL_TENSOR.SSM_CONV1D, MODEL_TENSOR.SSM_DT, @@ -2610,6 +2614,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, MODEL_TENSOR.FFN_GATE_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 
fc468d07745..e575610900e 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -567,6 +567,10 @@ class TensorNameMap: "model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe ), + MODEL_TENSOR.FFN_GATE_UP_EXP: ( + "model.layers.{bid}.mlp.experts.gate_up_proj", + ), + # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 106ea133ac0..47e8d5278ac 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -349,6 +349,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = { { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_GATE_UP_EXPS, "blk.%d.ffn_gate_up_exps" }, { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, @@ -1004,6 +1005,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_DOWN_SHEXP, @@ -1061,6 +1063,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_DOWN_SHEXP, @@ -1601,6 +1604,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_DOWN_SHEXP, @@ -2685,6 +2689,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, + {LLM_TENSOR_FFN_GATE_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 42a6ea38f38..6d1b1df31c0 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -373,6 +373,7 @@ enum llm_tensor { LLM_TENSOR_FFN_DOWN_EXPS, // merged experts LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_DOWN_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_UP_SHEXP, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index dc58c0826ae..23a86ea2905 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1165,7 +1165,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn( float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in) const { + ggml_tensor * probs_in, + ggml_tensor * gate_up_exps) const { return build_moe_ffn( cur, gate_inp, /* gate_inp_b */ nullptr, @@ -1181,7 +1182,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn( w_scale, gating_op, il, - probs_in + probs_in, + gate_up_exps ); } @@ -1204,7 +1206,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in) const { + ggml_tensor * probs_in, + 
ggml_tensor * gate_up_exps, + ggml_tensor * gate_up_exps_b) const { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN @@ -1343,27 +1347,49 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_weighted", il); } - ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); + ggml_tensor * up = nullptr; + ggml_tensor * experts = nullptr; - if (up_exps_b) { - up = ggml_add_id(ctx0, up, up_exps_b, selected_experts); - cb(up, "ffn_moe_up_biased", il); - } + if (gate_up_exps) { + // merged gate_up path: one mul_mat_id, then split into gate and up views + ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens] + cb(gate_up, "ffn_moe_gate_up", il); - ggml_tensor * experts = nullptr; - if (gate_exps) { - cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + if (gate_up_exps_b) { + gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts); + cb(gate_up, "ffn_moe_gate_up_biased", il); + } + + const int64_t n_ff = gate_up->ne[0] / 2; + cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0); cb(cur, "ffn_moe_gate", il); + up = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], n_ff * gate_up->nb[0]); + cb(up, "ffn_moe_up", il); } else { - cur = up; - } + // separate gate and up path + up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); + + if (up_exps_b) { + up = ggml_add_id(ctx0, up, up_exps_b, selected_experts); + cb(up, "ffn_moe_up_biased", il); + } - if (gate_exps_b) { - cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts); - cb(cur, "ffn_moe_gate_biased", il); + if (gate_exps) { + cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate", il); + } else { + cur = up; + } + + if (gate_exps_b) { + cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts); + cb(cur, "ffn_moe_gate_biased", il); + } } + const bool has_gate = gate_exps || gate_up_exps; + switch (type_op) { case LLM_FFN_SILU: if (gate_exps) { @@ -1385,7 +1411,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( break; } } + } + if (has_gate) { cur = ggml_swiglu_split(ctx0, cur, up); cb(cur, "ffn_moe_swiglu", il); } else { cur = ggml_silu(ctx0, cur); cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: - if (gate_exps) { + if (has_gate) { cur = ggml_geglu_split(ctx0, cur, up); cb(cur, "ffn_moe_geglu", il); } else { @@ -1409,7 +1437,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_swiglu_oai", il); } break; case LLM_FFN_RELU: - if (gate_exps) { + if (has_gate) { cur = ggml_reglu_split(ctx0, cur, up); cb(cur, "ffn_moe_reglu", il); } else { @@ -1417,7 +1445,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_relu", il); } break; case LLM_FFN_RELU_SQR: - if (gate_exps) { + if (has_gate) { // TODO: add support for gated squared relu GGML_ABORT("fatal error: gated squared relu not implemented"); } else {
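On the conversion side the gate rows are stored before the up rows for each expert, and the merged path above recovers the two halves with ggml_view_3d calls that keep the fused tensor's strides, the up view starting n_ff * nb[0] bytes into each innermost row. A small sketch of that offset arithmetic, with a flat array standing in for the [n_ff*2, n_expert_used, n_tokens] mul_mat_id result (contiguous layout assumed, sizes illustrative):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_ff = 4, n_cols = 2;  // n_cols ~ n_expert_used * n_tokens

        // fused activations as mul_mat_id produces them: ne[0] == n_ff * 2, so
        // each column holds n_ff gate values followed by n_ff up values
        std::vector<float> gate_up(2 * n_ff * n_cols);
        for (int c = 0; c < n_cols; c++) {
            for (int i = 0; i < 2 * n_ff; i++) {
                gate_up[c * 2 * n_ff + i] = (i < n_ff) ? 1.0f + i : 100.0f + i;
            }
        }

        // equivalent of the two ggml_view_3d calls: both views share the column
        // stride (nb[1]); the up view is shifted by n_ff elements (n_ff * nb[0])
        const int     col_stride = 2 * n_ff;
        const float * gate       = gate_up.data();         // byte offset 0
        const float * up         = gate_up.data() + n_ff;  // byte offset n_ff * sizeof(float)

        for (int c = 0; c < n_cols; c++) {
            std::printf("col %d: gate[0] = %g, up[0] = %g\n",
                        c, gate[c * col_stride], up[c * col_stride]);
        }
    }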
diff --git a/src/llama-graph.h b/src/llama-graph.h index 22d11a83857..e8f006977d2 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -814,7 +814,8 @@ struct llm_graph_context { float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in = nullptr) const; + ggml_tensor * probs_in = nullptr, + ggml_tensor * gate_up_exps = nullptr) const; ggml_tensor * build_moe_ffn( ggml_tensor * cur, @@ -835,7 +836,9 @@ struct llm_graph_context { llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in = nullptr) const; + ggml_tensor * probs_in = nullptr, + ggml_tensor * gate_up_exps = nullptr, + ggml_tensor * gate_up_exps_b = nullptr) const; // // inputs diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c8ef1b5e7c2..dabf3b3086e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2980,6 +2980,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // TODO: move to a separate function const auto tn = LLM_TN(arch); + + // helper: try merged gate_up_exps first, fall back to separate gate and up + auto create_tensor_gate_up_exps = [&](llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) { + layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED); + if (layer.ffn_gate_up_exps == nullptr) { + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags); + } + }; switch (arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: @@ -5221,9 +5230,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } // MoE branch - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); // Shared expert branch layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); @@ -7425,9 +7433,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); // Shared experts layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); @@ -7491,9 +7498,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); // Shared experts const int64_t n_ff_shexp =
hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; diff --git a/src/llama-model.h b/src/llama-model.h index 96e407a0b30..d7c3e7d1c1a 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -280,14 +280,16 @@ struct llama_layer { struct ggml_tensor * ffn_up_enc = nullptr; // ff MoE - struct ggml_tensor * ffn_gate_inp = nullptr; - struct ggml_tensor * ffn_gate_exps = nullptr; - struct ggml_tensor * ffn_down_exps = nullptr; - struct ggml_tensor * ffn_up_exps = nullptr; - struct ggml_tensor * ffn_gate_inp_b = nullptr; - struct ggml_tensor * ffn_gate_exps_b = nullptr; - struct ggml_tensor * ffn_down_exps_b = nullptr; - struct ggml_tensor * ffn_up_exps_b = nullptr; + struct ggml_tensor * ffn_gate_inp = nullptr; + struct ggml_tensor * ffn_gate_exps = nullptr; + struct ggml_tensor * ffn_down_exps = nullptr; + struct ggml_tensor * ffn_up_exps = nullptr; + struct ggml_tensor * ffn_gate_up_exps = nullptr; + struct ggml_tensor * ffn_gate_inp_b = nullptr; + struct ggml_tensor * ffn_gate_exps_b = nullptr; + struct ggml_tensor * ffn_down_exps_b = nullptr; + struct ggml_tensor * ffn_up_exps_b = nullptr; + struct ggml_tensor * ffn_gate_up_exps_b = nullptr; // ff shared expert (shexp) struct ggml_tensor * ffn_gate_inp_shexp = nullptr; diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index b2c1f160601..b608396e50e 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -218,7 +218,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr LLM_FFN_SILU, hparams.expert_weights_norm, hparams.expert_weights_scale, hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, - il); + il, + nullptr, + model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); // FFN shared expert diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 77f18b5aeb8..22d708f2062 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -380,7 +380,8 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_ffn(ggml_tensor * cur, const int model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, nullptr, n_expert, n_expert_used, LLM_FFN_SILU, - true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + nullptr, model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); // Add shared experts if present - following Qwen3Next reference implementation diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 9d3a68cfe5e..f2621200f23 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -479,7 +479,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, nullptr, n_expert, n_expert_used, LLM_FFN_SILU, - true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + nullptr, model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); // Add shared experts if present - following Qwen3Next reference implementation