From a8b192b6ec4fd517fc966ef6fa02c01f8157bbee Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Fri, 27 Feb 2026 05:37:54 +0100 Subject: [PATCH 1/2] tests : enable test-chat out of tree build (#19558) The binary relies on model files that it tries to find. However, when configuring the build directory to be parallel to the source tree those heuristics fail. This sets the working directory for the test executable to be the source-tree which resolves this issue. --- tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 350bffc3157..d98a090c327 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -152,7 +152,7 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS) llama_build_and_test(test-grammar-parser.cpp) llama_build_and_test(test-grammar-integration.cpp) llama_build_and_test(test-llama-grammar.cpp) - llama_build_and_test(test-chat.cpp) + llama_build_and_test(test-chat.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) From 2e7e6385230ede73135191e83c81cc9f138c2d65 Mon Sep 17 00:00:00 2001 From: Pascal Date: Fri, 27 Feb 2026 07:05:23 +0100 Subject: [PATCH 2/2] server : support multiple model aliases via comma-separated --alias (#19926) * server : support multiple model aliases via comma-separated --alias * server : update --alias description and regenerate docs * server : multiple model aliases and tags - address review feedback from ngxson - --alias accepts comma-separated values (std::set, no duplicates) - --tags for informational metadata (not used for routing) - aliases resolve transparently in router via get_meta/has_model - /v1/models exposes aliases and tags fields * regenerate docs * nits * server : use first alias as model_name for backward compat address review feedback from ngxson * server : add single-model test for aliases and tags --- common/arg.cpp | 21 +++++- common/common.h | 3 +- tools/cli/README.md | 10 +-- tools/completion/README.md | 10 +-- tools/server/README.md | 21 ++++-- tools/server/server-context.cpp | 14 +++- tools/server/server-context.h | 3 + tools/server/server-models.cpp | 99 ++++++++++++++++++++++++--- tools/server/server-models.h | 2 + tools/server/server.cpp | 2 +- tools/server/tests/unit/test_basic.py | 17 +++++ tools/server/tests/utils.py | 3 + 12 files changed, 173 insertions(+), 32 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1e8885c9ca5..05f4a5244e7 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2520,11 +2520,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"-a", "--alias"}, "STRING", - "set alias for model name (to be used by REST API)", + "set model name aliases, comma-separated (to be used by API)", [](common_params & params, const std::string & value) { - params.model_alias = value; + for (auto & alias : string_split(value, ',')) { + alias = string_strip(alias); + if (!alias.empty()) { + params.model_alias.insert(alias); + } + } } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); + add_opt(common_arg( + {"--tags"}, "STRING", + "set model tags, comma-separated (informational, not used for routing)", + [](common_params & params, const std::string & value) { + for (auto & tag : string_split(value, ',')) { + tag = string_strip(tag); + if (!tag.empty()) { + params.model_tags.insert(tag); + } + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS")); add_opt(common_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA diff --git a/common/common.h b/common/common.h index 1fa17286562..c5a80375713 100644 --- a/common/common.h +++ b/common/common.h @@ -410,7 +410,8 @@ struct common_params { struct common_params_model model; - std::string model_alias = ""; // model alias // NOLINT + std::set model_alias; // model aliases // NOLINT + std::set model_tags; // model tags (informational, not used for routing) // NOLINT std::string hf_token = ""; // HF token // NOLINT std::string prompt = ""; // NOLINT std::string system_prompt = ""; // NOLINT diff --git a/tools/cli/README.md b/tools/cli/README.md index 4a15cbad9d7..22d3fc87e96 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -57,8 +57,8 @@ | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -109,14 +109,14 @@ | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | diff --git a/tools/completion/README.md b/tools/completion/README.md index 3ca3e684541..bcc08876592 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -140,8 +140,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -192,14 +192,14 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | diff --git a/tools/server/README.md b/tools/server/README.md index 34b722a27c5..da16ddc756e 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -74,8 +74,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -126,14 +126,14 @@ For the full list of features, please refer to [server's changelog](https://gith | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | @@ -162,9 +162,11 @@ For the full list of features, please refer to [server's changelog](https://gith | Argument | Explanation | | -------- | ----------- | +| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | +| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | -| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | +| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | | `-sp, --special` | special tokens output enabled (default: false) | @@ -182,7 +184,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_CPU_MOE_DRAFT) | | `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_N_CPU_MOE_DRAFT) | -| `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_ALIAS) | +| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)
(env: LLAMA_ARG_ALIAS) | +| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)
(env: LLAMA_ARG_TAGS) | | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | @@ -229,6 +232,10 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | +| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) | +| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) | +| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) | +| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) | | `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) | | `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall | | `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) | diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index eba463e4dac..aafed495020 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -580,6 +580,8 @@ struct server_context_impl { float slot_prompt_similarity = 0.0f; std::string model_name; // name of the loaded model, to be used by API + std::set model_aliases; // additional names for the model + std::set model_tags; // informational tags bool sleeping = false; @@ -813,10 +815,9 @@ struct server_context_impl { SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); if (!params_base.model_alias.empty()) { - // user explicitly specified model name - model_name = params_base.model_alias; + // backward compat: use first alias as model name + model_name = *params_base.model_alias.begin(); } else if (!params_base.model.name.empty()) { - // use model name in registry format (for models in cache) model_name = params_base.model.name; } else { // fallback: derive model name from file name @@ -824,6 +825,9 @@ struct server_context_impl { model_name = model_path.filename().string(); } + model_aliases = params_base.model_alias; + model_tags = params_base.model_tags; + if (!is_resume) { return init(); } @@ -2892,6 +2896,8 @@ server_context_meta server_context::get_meta() const { return server_context_meta { /* build_info */ build_info, /* model_name */ impl->model_name, + /* model_aliases */ impl->model_aliases, + /* model_tags */ impl->model_tags, /* model_path */ impl->params_base.model.path, /* has_mtmd */ impl->mctx != nullptr, /* has_inp_image */ impl->chat_params.allow_image, @@ -3688,6 +3694,8 @@ void server_routes::init_routes() { {"data", { { {"id", meta->model_name}, + {"aliases", meta->model_aliases}, + {"tags", meta->model_tags}, {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 03c29f513bf..631d573fcbd 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -6,12 +6,15 @@ #include #include +#include struct server_context_impl; // private implementation struct server_context_meta { std::string build_info; std::string model_name; + std::set model_aliases; + std::set model_tags; std::string model_path; bool has_mtmd; bool has_inp_image; diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index e1625477993..bc601237b7d 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -184,6 +184,51 @@ void server_models::add_model(server_model_meta && meta) { if (mapping.find(meta.name) != mapping.end()) { throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str())); } + + // check model name does not conflict with existing aliases + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(meta.name)) { + throw std::runtime_error(string_format("model name '%s' conflicts with alias of model '%s'", + meta.name.c_str(), key.c_str())); + } + } + + // parse aliases from preset's --alias option (comma-separated) + std::string alias_str; + if (meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) { + for (auto & alias : string_split(alias_str, ',')) { + alias = string_strip(alias); + if (!alias.empty()) { + meta.aliases.insert(alias); + } + } + } + + // parse tags from preset's --tags option (comma-separated) + std::string tags_str; + if (meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) { + for (auto & tag : string_split(tags_str, ',')) { + tag = string_strip(tag); + if (!tag.empty()) { + meta.tags.insert(tag); + } + } + } + + // validate aliases do not conflict with existing names or aliases + for (const auto & alias : meta.aliases) { + if (mapping.find(alias) != mapping.end()) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with existing model name", + alias.c_str(), meta.name.c_str())); + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(alias)) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with alias of model '%s'", + alias.c_str(), meta.name.c_str(), key.c_str())); + } + } + } + meta.update_args(ctx_preset, bin_path); // render args std::string name = meta.name; mapping[name] = instance_t{ @@ -249,6 +294,8 @@ void server_models::load_models() { server_model_meta meta{ /* preset */ preset.second, /* name */ preset.first, + /* aliases */ {}, + /* tags */ {}, /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, @@ -265,10 +312,28 @@ void server_models::load_models() { for (const auto & [name, preset] : custom_presets) { custom_names.insert(name); } + auto join_set = [](const std::set & s) { + std::string result; + for (const auto & v : s) { + if (!result.empty()) { + result += ", "; + } + result += v; + } + return result; + }; + SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); for (const auto & [name, inst] : mapping) { bool has_custom = custom_names.find(name) != custom_names.end(); - SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str()); + std::string info; + if (!inst.meta.aliases.empty()) { + info += " (aliases: " + join_set(inst.meta.aliases) + ")"; + } + if (!inst.meta.tags.empty()) { + info += " [tags: " + join_set(inst.meta.tags) + "]"; + } + SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str()); } } @@ -320,7 +385,15 @@ void server_models::update_meta(const std::string & name, const server_model_met bool server_models::has_model(const std::string & name) { std::lock_guard lk(mutex); - return mapping.find(name) != mapping.end(); + if (mapping.find(name) != mapping.end()) { + return true; + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(name)) { + return true; + } + } + return false; } std::optional server_models::get_meta(const std::string & name) { @@ -329,6 +402,11 @@ std::optional server_models::get_meta(const std::string & nam if (it != mapping.end()) { return it->second.meta; } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(name)) { + return inst.meta; + } + } return std::nullopt; } @@ -766,7 +844,7 @@ static void res_err(std::unique_ptr & res, const json & error_d res->data = safe_json_to_str({{ "error", error_data }}); } -static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { +static bool router_validate_model(std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { if (name.empty()) { res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); return false; @@ -776,6 +854,8 @@ static bool router_validate_model(const std::string & name, server_models & mode res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST)); return false; } + // resolve alias to canonical model name + name = meta->name; if (models_autoload) { models.ensure_model_loaded(name); } else { @@ -847,16 +927,16 @@ void server_models_routes::init_routes() { auto res = std::make_unique(); json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); - auto model = models.get_meta(name); - if (!model.has_value()) { + auto meta = models.get_meta(name); + if (!meta.has_value()) { res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); return res; } - if (model->status == SERVER_MODEL_STATUS_LOADED) { + if (meta->status == SERVER_MODEL_STATUS_LOADED) { res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.load(name); + models.load(meta->name); res_ok(res, {{"success", true}}); return res; }; @@ -877,6 +957,7 @@ void server_models_routes::init_routes() { preset_copy.unset_option("LLAMA_ARG_HOST"); preset_copy.unset_option("LLAMA_ARG_PORT"); preset_copy.unset_option("LLAMA_ARG_ALIAS"); + preset_copy.unset_option("LLAMA_ARG_TAGS"); status["preset"] = preset_copy.to_ini(); } if (meta.is_failed()) { @@ -885,6 +966,8 @@ void server_models_routes::init_routes() { } models_json.push_back(json { {"id", meta.name}, + {"aliases", meta.aliases}, + {"tags", meta.tags}, {"object", "model"}, // for OAI-compat {"owned_by", "llamacpp"}, // for OAI-compat {"created", t}, // for OAI-compat @@ -912,7 +995,7 @@ void server_models_routes::init_routes() { res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.unload(name); + models.unload(model->name); res_ok(res, {{"success", true}}); return res; }; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index a397abda4a8..78abc8d72a7 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -52,6 +52,8 @@ static std::string server_model_status_to_string(server_model_status status) { struct server_model_meta { common_preset preset; std::string name; + std::set aliases; // additional names that resolve to this model + std::set tags; // informational tags, not used for routing int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d3d4316026a..542b984534c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { // for consistency between server router mode and single-model mode, we set the same model name as alias if (params.model_alias.empty() && !params.model.name.empty()) { - params.model_alias = params.model.name; + params.model_alias.insert(params.model.name); } common_init(); diff --git a/tools/server/tests/unit/test_basic.py b/tools/server/tests/unit/test_basic.py index 3405be3e25d..d1b89cf1a91 100644 --- a/tools/server/tests/unit/test_basic.py +++ b/tools/server/tests/unit/test_basic.py @@ -94,3 +94,20 @@ def test_no_webui(): server.start() res = requests.get(url) assert res.status_code == 404 + + +def test_server_model_aliases_and_tags(): + global server + server.model_alias = "tinyllama-2,fim,code" + server.model_tags = "chat,fim,small" + server.start() + res = server.make_request("GET", "/models") + assert res.status_code == 200 + assert len(res.body["data"]) == 1 + model = res.body["data"][0] + # aliases field must contain all aliases + assert set(model["aliases"]) == {"tinyllama-2", "fim", "code"} + # tags field must contain all tags + assert set(model["tags"]) == {"chat", "fim", "small"} + # id is derived from first alias (alphabetical order from std::set) + assert model["id"] == "code" diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index f76bb1a9115..5002999d9b3 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -56,6 +56,7 @@ class ServerProcess: # custom options model_alias: str | None = None + model_tags: str | None = None model_url: str | None = None model_file: str | None = None model_draft: str | None = None @@ -180,6 +181,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.extend(["--pooling", self.pooling]) if self.model_alias: server_args.extend(["--alias", self.model_alias]) + if self.model_tags: + server_args.extend(["--tags", self.model_tags]) if self.n_ctx: server_args.extend(["--ctx-size", self.n_ctx]) if self.n_slots: