From 44084941448d841c9b12ad250da5619cddb58bd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Mon, 16 Feb 2026 16:06:48 +0100 Subject: [PATCH 1/5] build : rework llama_option_depr to handle LLAMA_CURL (#19658) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- CMakeLists.txt | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d10ab6da961..32542ecd27a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,11 +115,6 @@ option(LLAMA_TESTS_INSTALL "llama: install tests" ON) option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON) option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) -# deprecated -option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) -if (LLAMA_CURL) - message(WARNING "LLAMA_CURL option is deprecated and will be ignored") -endif() # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) @@ -147,10 +142,15 @@ if (NOT DEFINED GGML_CUDA_GRAPHS) endif() # transition helpers -function (llama_option_depr TYPE OLD NEW) +function (llama_option_depr TYPE OLD) if (${OLD}) - message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") - set(${NEW} ON PARENT_SCOPE) + set(NEW "${ARGV2}") + if(NEW) + message(${TYPE} "${OLD} is deprecated, use ${NEW} instead") + set(${NEW} ON PARENT_SCOPE) + else() + message(${TYPE} "${OLD} is deprecated and will be ignored") + endif() endif() endfunction() @@ -163,6 +163,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) llama_option_depr(WARNING LLAMA_CANN GGML_CANN) +llama_option_depr(WARNING LLAMA_CURL) include("cmake/license.cmake") license_add_file("llama.cpp" "LICENSE") From 5f28c53d11210f3521328d6dac620c4b6ae0044b Mon Sep 17 00:00:00 2001 From: Saurabh Dash <111897126+saurabhdash2512@users.noreply.github.com> Date: Mon, 16 Feb 2026 10:28:46 -0500 Subject: [PATCH 2/5] model: Add support for Tiny Aya Models (#19611) * changes for tiny aya * changes to hash * changes to vocab * fix some tokenizer regex edge cases * update comment * add some comments for regex * Apply suggestion from @ngxson --------- Co-authored-by: Xuan-Son Nguyen --- convert_hf_to_gguf.py | 14 ++++++++++++++ convert_hf_to_gguf_update.py | 1 + src/llama-vocab.cpp | 16 ++++++++++++++-- src/llama-vocab.h | 1 + src/unicode.cpp | 6 ++++++ 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0f614e4df3d..d7141f01cf0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1124,6 +1124,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 res = "command-r" + if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1": + # ref: https://huggingface.co/CohereLabs/tiny-aya-base + res = "tiny_aya" if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": # ref: https://huggingface.co/Qwen/Qwen1.5-7B res = "qwen2" @@ -7360,6 +7363,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Cohere2 runtime in llama.cpp expects no bias tensors; + # the actual weight only contains 0-value tensors as bias, we can skip them + if name.endswith(".bias"): + if torch.any(data_torch != 0): + raise ValueError(f"Bias tensor {name!r} is not zero.") + logger.debug(f"Skipping bias tensor {name!r} for Cohere2 conversion.") + return + + yield from super().modify_tensors(data_torch, name, bid) + @ModelBase.register("OlmoForCausalLM") @ModelBase.register("OLMoForCausalLM") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index a6834515080..8bd24dbe917 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -99,6 +99,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", }, {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, + {"name": "tiny_aya", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", }, {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", }, {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", }, diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 62e137fb842..b35cb02ce4d 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -422,6 +422,14 @@ struct llm_tokenizer_bpe : llm_tokenizer { "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_TINY_AYA: + regex_exprs = { + // original regex from tokenizer.json: "\\d{1,3}(?=(?:\\d{3})*\\b)" + "\\d{1,3}(?=(?:\\d{3})*\\b)", + // original regex from tokenizer.json: "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; case LLAMA_VOCAB_PRE_TYPE_KIMI_K2: regex_exprs = { // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp @@ -2005,10 +2013,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "megrez") { pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; } else if ( - tokenizer_pre == "gpt-4o" || - tokenizer_pre == "llama4") { + tokenizer_pre == "gpt-4o" || + tokenizer_pre == "llama4") { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O; clean_spaces = false; + } else if ( + tokenizer_pre == "tiny_aya") { + pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA; + clean_spaces = false; } else if ( tokenizer_pre == "superbpe") { pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE; diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 718238fb866..1312a877ab0 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -55,6 +55,7 @@ enum llama_vocab_pre_type { LLAMA_VOCAB_PRE_TYPE_YOUTU = 44, LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45, LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46, + LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47, }; struct LLM_KV; diff --git a/src/unicode.cpp b/src/unicode.cpp index b88d953bd27..1475b53b659 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -769,6 +769,12 @@ static std::vector unicode_regex_split_custom(const std::string & text, } else if (regex_expr == "\\p{AFMoE_digits}") { // AFMOE digit pattern - use custom implementation for proper splitting bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets); + } else if (regex_expr == "\\d{1,3}(?=(?:\\d{3})*\\b)") { + // tiny_aya digit grouping pattern from tokenizer.json: + // {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"} + // Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567) + // TODO: Revisit this regex, incase there are any subtle tokenization differences with the original regex. + bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets); } return bpe_offsets; From d23a55997de9f42754e02f9022fefd2e4d41f06f Mon Sep 17 00:00:00 2001 From: Judd <4046440+foldl@users.noreply.github.com> Date: Mon, 16 Feb 2026 23:43:34 +0800 Subject: [PATCH 3/5] ggml : make `ggml_is_view` as API (#19539) * make `ggml_is_view` as API * introduce `ggml_aux_is_view` as inline version for internal use. * change `ggml_aux_is_view` to `ggml_impl_is_view` --- ggml/include/ggml.h | 1 + ggml/src/ggml-alloc.c | 13 ++++--------- ggml/src/ggml-impl.h | 4 ++++ ggml/src/ggml.c | 4 ++++ 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index f759e2d5883..77af0e7fb6a 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -752,6 +752,7 @@ extern "C" { GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_view (const struct ggml_tensor * tensor); GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 41419b617bd..7f414b2311c 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -17,11 +17,6 @@ //#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__) #define AT_PRINTF(...) - -static bool ggml_is_view(const struct ggml_tensor * t) { - return t->view_src != NULL; -} - // ops that return true for this function must not use restrict pointers for their backend implementations bool ggml_op_can_inplace(enum ggml_op op) { switch (op) { @@ -627,7 +622,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor GGML_ASSERT(buffer_id >= 0); struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); - if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) { + if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_impl_is_view(node)) { hn->allocated = true; assert(hn->addr.offset == 0); @@ -658,7 +653,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); if (p_hn->n_children == 1 && p_hn->n_views == 0) { - if (ggml_is_view(parent)) { + if (ggml_impl_is_view(parent)) { struct ggml_tensor * view_src = parent->view_src; struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { @@ -739,7 +734,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node // itself is never used and should not be considered a dependency - if (ggml_is_view(node) && node->op != GGML_OP_NONE) { + if (ggml_impl_is_view(node) && node->op != GGML_OP_NONE) { struct ggml_tensor * view_src = node->view_src; ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; } @@ -806,7 +801,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated); if (p_hn->n_children == 0 && p_hn->n_views == 0) { - if (ggml_is_view(parent)) { + if (ggml_impl_is_view(parent)) { struct ggml_tensor * view_src = parent->view_src; struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); view_src_hn->n_views -= 1; diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index baadfe9a7b3..e3714b38a6a 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -98,6 +98,10 @@ static bool ggml_op_is_empty(enum ggml_op op) { } } +static inline bool ggml_impl_is_view(const struct ggml_tensor * t) { + return t->view_src != NULL; +} + static inline float ggml_compute_softplus_f32(float input) { return (input > 20.0f) ? input : logf(1 + expf(input)); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e2a6ff67be7..ed819eaa4c5 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1496,6 +1496,10 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso (t0->nb[3] == t1->nb[3]); } +bool ggml_is_view(const struct ggml_tensor * t) { + return ggml_impl_is_view(t); +} + // check if t1 can be represented as a repetition of t0 bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); From cceb1b4e33cfd9595b4ac1949f2c0857e43af427 Mon Sep 17 00:00:00 2001 From: Ivan Chikish Date: Mon, 16 Feb 2026 18:52:24 +0300 Subject: [PATCH 4/5] common : inline functions (#18639) --- common/common.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/common.h b/common/common.h index 804485fb198..64102483777 100644 --- a/common/common.h +++ b/common/common.h @@ -670,7 +670,7 @@ static std::vector string_split(const std::string & str, char delim) { } template<> -std::vector string_split(const std::string & input, char separator) +inline std::vector string_split(const std::string & input, char separator) { std::vector parts; size_t begin_pos = 0; @@ -685,7 +685,7 @@ std::vector string_split(const std::string & input, ch return parts; } -static bool string_starts_with(const std::string & str, +inline bool string_starts_with(const std::string & str, const std::string & prefix) { // While we wait for C++20's std::string::starts_with... return str.rfind(prefix, 0) == 0; } @@ -870,11 +870,11 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps"; -static std::string llm_ffn_exps_block_regex(int idx) { +inline std::string llm_ffn_exps_block_regex(int idx) { return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX); } -static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() { +inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() { return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() }; } From d612901116ab2066c7923372d4827032ff296bc4 Mon Sep 17 00:00:00 2001 From: AesSedai <7980540+AesSedai@users.noreply.github.com> Date: Mon, 16 Feb 2026 08:44:44 -0800 Subject: [PATCH 5/5] perplexity: add proper batching (#19661) --- tools/perplexity/perplexity.cpp | 154 ++++++++++++++++++-------------- 1 file changed, 89 insertions(+), 65 deletions(-) diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp index 1ead9c871e9..433b747f0d4 100644 --- a/tools/perplexity/perplexity.cpp +++ b/tools/perplexity/perplexity.cpp @@ -347,7 +347,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params int count = 0; double nll = 0.0; - LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + const int n_seq = std::max(1, n_batch / n_ctx); + LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); for (int i = 0; i < n_chunk; ++i) { const int start = i * params.ppl_stride; @@ -1737,11 +1738,21 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { } const int n_batch = params.n_batch; - const int num_batches = (n_ctx + n_batch - 1)/n_batch; + const int num_batches = (static_cast(n_ctx) + n_batch - 1) / n_batch; + // Calculate n_seq based on the logits file's n_ctx, but cap it at what the context supports + const int n_seq_max = llama_n_seq_max(ctx); + int n_seq = std::max(1, n_batch / static_cast(n_ctx)); + if (n_seq > n_seq_max) { + LOG_WRN("%s: calculated n_seq=%d exceeds context's n_seq_max=%d, capping at %d\n", + __func__, n_seq, n_seq_max, n_seq_max); + n_seq = n_seq_max; + } const int nv = 2*((n_vocab + 1)/2) + 4; const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + llama_batch batch = llama_batch_init(std::min(n_batch, static_cast(n_ctx)*n_seq), 0, 1); + std::vector log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); std::vector kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); @@ -1750,6 +1761,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { logits.reserve(size_t(n_ctx) * n_vocab); } + LOG_INF("%s: computing over %d chunks, n_ctx=%u, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); + std::vector workers(std::thread::hardware_concurrency() - 1); auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) { @@ -1774,107 +1787,122 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { auto kld_ptr = kld_values.data(); auto p_diff_ptr = p_diff_values.data(); - for (int i = 0; i < n_chunk; ++i) { + const int first = n_ctx/2; + + for (int i = 0; i < n_chunk; i += n_seq) { const int start = i * n_ctx; const int end = start + n_ctx; - const auto t_start = std::chrono::high_resolution_clock::now(); + const int n_seq_batch = std::min(n_seq, n_chunk - i); - if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { - LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i); - return; - } + const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache llama_memory_clear(llama_get_memory(ctx), true); - llama_batch batch = llama_batch_init(n_batch, 0, 1); - for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); - // save original token and restore it after eval - const auto token_org = tokens[batch_start]; - - // add BOS token for the first batch of each chunk - if (add_bos && j == 0) { - tokens[batch_start] = llama_vocab_bos(vocab); - } + int n_outputs = 0; common_batch_clear(batch); - for (int i = 0; i < batch_size; i++) { - common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true); + for (int seq = 0; seq < n_seq_batch; seq++) { + int seq_start = batch_start + seq*n_ctx; + + // save original token and restore it after eval + const auto token_org = tokens[seq_start]; + + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[seq_start] = llama_vocab_bos(vocab); + } + + for (int k = 0; k < batch_size; ++k) { + const int pos = j*n_batch + k; + const bool need_logits = pos >= first; + common_batch_add(batch, tokens[seq_start + k], pos, { seq }, need_logits); + n_outputs += need_logits; + } + + // restore the original token in case it was set to BOS + tokens[seq_start] = token_org; } if (llama_decode(ctx, batch)) { - LOG_ERR("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to decode\n", __func__); llama_batch_free(batch); return; } - // restore the original token in case it was set to BOS - tokens[batch_start] = token_org; - - if (num_batches > 1) { + if (num_batches > 1 && n_outputs > 0) { const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab); + logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab); } } - llama_batch_free(batch); - - const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0) { + llama_synchronize(ctx); + const auto t_end = std::chrono::high_resolution_clock::now(); const float t_total = std::chrono::duration(t_end - t_start).count(); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); - int total_seconds = (int)(t_total * n_chunk); + int total_seconds = (int)(t_total * n_chunk / n_seq); if (total_seconds >= 60*60) { LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } LOG("%.2f minutes\n", total_seconds / 60.0); + LOG("\n"); + LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); } - LOG("\n"); - LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); - const int first = n_ctx/2; - const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr); - p_diff_ptr += n_ctx - 1 - first; - kld_ptr += n_ctx - 1 - first; + // Read log probs for each sequence in the batch + for (int seq = 0; seq < n_seq_batch; seq++) { + if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { + LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i + seq); + llama_batch_free(batch); + return; + } - LOG("%4d", i+1); + const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first); - auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); - const double ppl_val = exp(log_ppl.first); - const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); + process_logits(n_vocab, all_logits, tokens.data() + start + seq*n_ctx + first, n_ctx - 1 - first, + workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr); + p_diff_ptr += n_ctx - 1 - first; + kld_ptr += n_ctx - 1 - first; - auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); - const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); - const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; - const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); + LOG("%4d", i + seq + 1); - auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); + auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); + const double ppl_val = exp(log_ppl.first); + const double ppl_unc = ppl_val * log_ppl.second; + LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); - auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); - const double p_diff_rms_val = sqrt(p_diff_mse.first); - const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); + const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); + const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; + const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); + LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); - double p_top_val = 1.*kld.n_same_top/kld.count; - double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1)); - LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); + auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); + LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); - LOG("\n"); + auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); + const double p_diff_rms_val = sqrt(p_diff_mse.first); + const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; + LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + + double p_top_val = 1.*kld.n_same_top/kld.count; + double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1)); + LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); + + LOG("\n"); + } logits.clear(); } + + llama_batch_free(batch); LOG("\n"); if (kld.count < 100) return; // we do not wish to do statistics on so few values @@ -1996,7 +2024,7 @@ int main(int argc, char ** argv) { const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence; - if (ppl) { + if (ppl || params.kl_divergence) { const int32_t n_seq = std::max(1, params.n_batch / n_ctx); const int32_t n_kv = n_seq * n_ctx; @@ -2006,12 +2034,8 @@ int main(int argc, char ** argv) { params.n_batch = std::min(params.n_batch, n_kv); } else { params.n_batch = std::min(params.n_batch, params.n_ctx); - if (params.kl_divergence) { - params.n_parallel = 1; - } else { - // ensure there's at least enough seq_ids for HellaSwag - params.n_parallel = std::max(4, params.n_parallel); - } + // ensure there's at least enough seq_ids for HellaSwag + params.n_parallel = std::max(4, params.n_parallel); } if (params.ppl_stride > 0) {