LongLeCE · pull · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026
diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp
@@ -721,6 +721,8 @@ value member_expression::execute_impl(context & ctx) {
         int64_t arr_size = 0;
         if (is_val<value_array>(object)) {
             arr_size = object->as_array().size();
+        } else if (is_val<value_string>(object)) {
+            arr_size = object->as_string().length();
         }
 
         if (is_stmt<slice_expression>(this->property)) {

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -116,7 +116,8 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
                  small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
                  disable_mistral_community_chat_template: bool = False,
-                 sentence_transformers_dense_modules: bool = False):
+                 sentence_transformers_dense_modules: bool = False,
+                 fuse_gate_up_exps: bool = False):
         if type(self) is ModelBase or \
                 type(self) is TextModel or \
                 type(self) is MmprojModel:
@@ -135,6 +136,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
         self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
+        self.fuse_gate_up_exps = fuse_gate_up_exps
+        self._gate_exp_buffer: dict[int, Tensor] = {}
+        self._up_exp_buffer: dict[int, Tensor] = {}
         self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
         self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
         self.metadata_override = metadata_override
@@ -512,8 +516,31 @@ def set_gguf_parameters(self):
         raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid # unused
-        return [(self.map_tensor_name(name), data_torch)]
+        new_name = self.map_tensor_name(name)
+
+        # Handle gate/up expert tensor fusion if enabled
+        if self.fuse_gate_up_exps and bid is not None:
+            if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid):
+                self._gate_exp_buffer[bid] = data_torch
+            elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid):
+                self._up_exp_buffer[bid] = data_torch
+
+            # Check if both gate and up are buffered for this layer
+            if bid in self._gate_exp_buffer and bid in self._up_exp_buffer:
+                gate_data = self._gate_exp_buffer.pop(bid)
+                up_data = self._up_exp_buffer.pop(bid)
+                # gate/up shape: (n_expert, n_ff, n_embd), concatenate to (n_expert, n_ff*2, n_embd)
+                fused_data = torch.cat([gate_data, up_data], dim=1)
+                fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, bid)
+                logger.info(f"Fused gate_exps and up_exps for layer {bid}")
+                return [(fused_name, fused_data)]
+
+            # If we buffered a gate/up tensor, wait for the other
+            if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid) or \
+               self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid):
+                return []
+
+        return [(new_name, data_torch)]
 
     def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid, n_dims  # unused
@@ -1148,6 +1175,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
             res = "jina-v2-de"
+        if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano
+            res = "jina-v5-nano"
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
@@ -6125,6 +6155,32 @@ def modify_tensors(self, data_torch, name, bid):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model")
+class EuroBertModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.EUROBERT
+
+    def set_vocab(self):
+        self.gguf_writer.add_add_bos_token(False)
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # EuroBert is bidirectional (encoder)
+        self.gguf_writer.add_causal_attention(False)
+
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+        self._try_set_pooling_type()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Strip "model." prefix from tensor names
+        if name.startswith("model."):
+            name = name[6:]
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
@@ -11913,6 +11969,11 @@ def parse_args() -> argparse.Namespace:
               "Default these modules are not included.")
     )
 
+    parser.add_argument(
+        "--fuse-gate-up-exps", action="store_true",
+        help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.",
+    )
+
     args = parser.parse_args()
     if not args.print_supported_models and args.model is None:
         parser.error("the following arguments are required: model")
@@ -12050,7 +12111,8 @@ def main() -> None:
                                      split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
                                      small_first_shard=args.no_tensor_first_split,
                                      remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
-                                     sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
+                                     sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
+                                     fuse_gate_up_exps=args.fuse_gate_up_exps
                                      )
 
         if args.vocab_only:

diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
@@ -107,6 +107,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "jina-v2-en",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
     {"name": "jina-v2-es",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "jina-v5-nano",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v5-text-nano", },
     {"name": "smaug-bpe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
     {"name": "poro-chat",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
     {"name": "jina-v2-code",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },

diff --git a/docs/backend/VirtGPU.md b/docs/backend/VirtGPU.md
@@ -152,7 +152,9 @@ Commands and data are serialized using a custom binary protocol with:
 - **VM-specific**: Only works in virtual machines with virtio-gpu support
 - **Host dependency**: Requires properly configured host-side backend
 - **Latency**: Small overhead from VM escaping for each operation
-
+- **Shared-memory size**: with the `libkrun` hypervisor, the RAM + VRAM
+  addressable memory is limited to 64 GB. So the maximum GPU memory
+  will be `64GB - RAM`, regardless of the hardware VRAM size.
 
 * This work is pending upstream changes in the VirglRenderer
   project.

diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp
@@ -7,9 +7,21 @@
 
 #include <cstdint>
 
+static uint32_t validate_graph_operation(size_t cgraph_size, uint32_t shmem_res_id, const char * operation) {
+    if (cgraph_size == 0) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Zero-size computation graph\n", operation);
+        return 1;
+    }
+
+    // place-holder: validate that the size of shmem_res_id is <= cgraph_size
+    // need to add another method in the Virgl->APIR callback interface
+    GGML_UNUSED(shmem_res_id);
+
+    return 0;  // Valid
+}
+
 uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
     GGML_UNUSED(ctx);
-    GGML_UNUSED(enc);
 
     static bool async_backend_initialized = false;
     static bool async_backend;
@@ -34,28 +46,53 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
     size_t cgraph_size;
     apir_decode_size_t(dec, &cgraph_size);
 
+    if (validate_graph_operation(cgraph_size, shmem_res_id, __func__) != 0) {
+        apir_decoder_set_fatal(dec);
+        return 1;
+    }
+
     apir_decoder secondary_dec = apir_new_decoder((const char *) shmem_data, cgraph_size);
 
     ggml_cgraph * cgraph = apir_decode_ggml_cgraph(&secondary_dec, cgraph_size);
 
+    if (!cgraph || apir_decoder_get_fatal(&secondary_dec)) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to deserialize computation graph\n", __func__);
+        return 1;
+    }
+
+    if (cgraph->n_nodes < 0 || cgraph->n_leafs < 0) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid negative node/leaf count: nodes=%d leafs=%d\n", __func__,
+                       cgraph->n_nodes, cgraph->n_leafs);
+        return 1;
+    }
+
     ggml_status status;
 #if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
     for (int idx = 0; idx < cgraph->n_nodes; idx++) {
         ggml_tensor * op = ggml_graph_node(cgraph, idx);
         if (dev->iface.supports_op(dev, op)) {
             continue;
         }
-        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op));
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", __func__, idx,
+                       ggml_op_desc(op));
 
         status = GGML_STATUS_ABORTED;
         apir_encode_ggml_status(enc, &status);
 
         return 0;
     }
 #endif
+
+    // Check if backend is properly initialized
+    if (!bck) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Backend not initialized (bck is null)\n", __func__);
+
+        return 1;
+    }
+
     status = bck->iface.graph_compute(bck, cgraph);
 
-    if (async_backend) {
+    if (async_backend && bck->iface.synchronize) {
         bck->iface.synchronize(bck);
     }
 

diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp
@@ -85,7 +85,19 @@ uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * d
 
     const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec);
 
-    size_t value = buft->iface.get_alloc_size(buft, op);
+    // Check for decode error
+    if (op == nullptr) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to decode tensor\n", __func__);
+        apir_decoder_set_fatal(dec);
+        return 1;
+    }
+
+    size_t value;
+    if (buft->iface.get_alloc_size) {
+        value = buft->iface.get_alloc_size(buft, op);
+    } else {
+        value = ggml_nbytes(op);  // Default fallback
+    }
 
     apir_encode_size_t(enc, &value);
 

diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp
@@ -6,11 +6,26 @@
 
 #include <cstdint>
 
+static uint32_t validate_buffer_operation(size_t offset, size_t size, const char * operation) {
+    // Only check for critical integer overflow - no arbitrary size limits
+    if (offset > SIZE_MAX - size) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Integer overflow in offset+size: %zu + %zu\n", operation, offset, size);
+        return 1;
+    }
+
+    return 0;  // Valid
+}
+
 uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
     GGML_UNUSED(ctx);
     ggml_backend_buffer_t buffer;
     buffer = apir_decode_ggml_buffer(dec);
 
+    if (!buffer || apir_decoder_get_fatal(dec)) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
+        return 1;
+    }
+
     uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer);
     apir_encode_uintptr_t(enc, &base);
 
@@ -24,6 +39,11 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl
     ggml_backend_buffer_t buffer;
     buffer = apir_decode_ggml_buffer(dec);
 
+    if (!buffer || apir_decoder_get_fatal(dec)) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
+        return 1;
+    }
+
     ggml_tensor * tensor;
     // safe to remove the const qualifier here
     tensor = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec);
@@ -37,6 +57,10 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl
     size_t size;
     apir_decode_size_t(dec, &size);
 
+    if (validate_buffer_operation(offset, size, __func__) != 0) {
+        return 1;
+    }
+
     void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
 
     if (!shmem_data) {
@@ -56,6 +80,11 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl
     ggml_backend_buffer_t buffer;
     buffer = apir_decode_ggml_buffer(dec);
 
+    if (!buffer || apir_decoder_get_fatal(dec)) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
+        return 1;
+    }
+
     const ggml_tensor * tensor;
     // safe to remove the const qualifier here
     tensor = apir_decode_ggml_tensor(dec);
@@ -69,6 +98,10 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl
     size_t size;
     apir_decode_size_t(dec, &size);
 
+    if (validate_buffer_operation(offset, size, __func__) != 0) {
+        return 1;
+    }
+
     void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
     if (!shmem_data) {
         GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
@@ -86,6 +119,11 @@ uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl
     ggml_backend_buffer_t buffer;
     buffer = apir_decode_ggml_buffer(dec);
 
+    if (!buffer || apir_decoder_get_fatal(dec)) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
+        return 1;
+    }
+
     const ggml_tensor * src;
     // safe to remove the const qualifier here
     src               = apir_decode_ggml_tensor(dec);
@@ -105,6 +143,11 @@ uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir
     ggml_backend_buffer_t buffer;
     buffer = apir_decode_ggml_buffer(dec);
 
+    if (!buffer || apir_decoder_get_fatal(dec)) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
+        return 1;
+    }
+
     uint8_t value;
     apir_decode_uint8_t(dec, &value);
 
@@ -120,6 +163,11 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg
     ggml_backend_buffer_t buffer;
     buffer = apir_decode_ggml_buffer(dec);
 
+    if (!buffer || apir_decoder_get_fatal(dec)) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
+        return 1;
+    }
+
     if (!apir_untrack_backend_buffer(buffer)) {
         GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: unknown buffer %p\n", __func__, (void *) buffer);
         return 1;

diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp
@@ -1,6 +1,6 @@
 #include "backend-dispatched.h"
-#include "backend-virgl-apir.h"
 
+#include "backend-virgl-apir.h"
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
 #include "ggml-impl.h"
@@ -28,19 +28,24 @@ uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) {
         return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED;
     }
 
-    if (!reg->iface.get_device_count(reg)) {
-        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device found\n", __func__);
+    size_t device_count = reg->iface.get_device_count(reg);
+    if (!device_count) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: no device found\n", __func__);
         return APIR_BACKEND_INITIALIZE_NO_DEVICE;
     }
 
     dev = reg->iface.get_device(reg, 0);
 
     if (!dev) {
-        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device received\n", __func__);
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: failed to get device\n", __func__);
         return APIR_BACKEND_INITIALIZE_NO_DEVICE;
     }
 
     bck = dev->iface.init_backend(dev, NULL);
+    if (!bck) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed\n", __func__);
+        return APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED;
+    }
 
     return APIR_BACKEND_INITIALIZE_SUCCESS;
 }