Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ Maintainers reserve the right to decline review or close pull requests for any r
# Code maintenance
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file reponsible for:
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
- Reviewing and merging related PRs
- Fixing related bugs
- Providing developer guidance/support
Expand Down
2 changes: 1 addition & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2399,7 +2399,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.fit_params = false;
} else {
throw std::runtime_error(
string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
}
}
).set_env("LLAMA_ARG_FIT"));
Expand Down
2 changes: 1 addition & 1 deletion common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,7 @@ std::string common_detokenize(
// Embedding utils
//

// TODO: repace embd_norm with an enum
// TODO: replace embd_norm with an enum
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
Expand Down
2 changes: 1 addition & 1 deletion common/debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml
// prints tensors that are processed in the computation graph
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
// The template parameter determins whether an error should be thrown whenever a NaN is encountered
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
// The callback data will be passed as the third parameter (user_data)
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
Expand Down
2 changes: 1 addition & 1 deletion common/jinja/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ The llama.cpp Jinja engine introduces `jinja::string` (see `jinja/string.h`), wh
- **One-to-many** (e.g., split): result is marked `is_input` **only if ALL** input parts are marked `is_input`
- **Many-to-one** (e.g., join): same as one-to-many

For string concatenation, string parts will be appended to the new string as-is, while perserving the `is_input` flag.
For string concatenation, string parts will be appended to the new string as-is, while preserving the `is_input` flag.

**Enabling Input Marking:**

Expand Down
12 changes: 6 additions & 6 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4031,7 +4031,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
# split Conv3D into Conv2Ds
c1, c2, kt, kh, kw = data_torch.shape
del c1, c2, kh, kw # unused
assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...])
yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
else:
Expand Down Expand Up @@ -5404,7 +5404,7 @@ def set_gguf_parameters(self):
# Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
linear_attn_config = self.hparams["linear_attn_config"]
# n_head == 0 for KDA layers, n_head > 0 for MLA layers
# full_attention_layers list will be used to distingush layer type
# full_attention_layers list will be used to distinguish layer type
_num_kv_heads = list()
_full_attn_layers = linear_attn_config["full_attn_layers"]
for il in range(self.hparams["num_hidden_layers"]):
Expand Down Expand Up @@ -6505,7 +6505,7 @@ def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
# default values below are taken from HF tranformers code
# default values below are taken from HF transformers code
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)
# calculate proj_scale_factor (used by tinygemma3 test model)
Expand Down Expand Up @@ -7097,7 +7097,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

if bid == 0 and "time_mix_a" in new_name:
# dummy v0/v1/v2 on first layer
# easist way to make llama happy
# easiest way to make llama happy
yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)

yield (new_name, data_torch)
Expand Down Expand Up @@ -9596,7 +9596,7 @@ def __init__(self, *args, **kwargs):
# NOTE: Explicitly include hparam prefix prefix for d_model to
# disambiguate with top-level head_dim
# NOTE 2: If needed for future models, this can be isolated in a method
# to separate the prefix setting and teh keys used
# to separate the prefix setting and the keys used
self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"])
self.n_group = self.find_hparam(["n_groups", "num_groups"])
self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model
Expand Down Expand Up @@ -9743,7 +9743,7 @@ def set_gguf_parameters(self):
self.gguf_writer.add_value_length(self.head_dim)

# Set feed_forward_length
# NOTE: This will trigger an override warning. This is preferrable to
# NOTE: This will trigger an override warning. This is preferable to
# duplicating all the parent logic
if not self.is_moe:
n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
Expand Down
4 changes: 2 additions & 2 deletions docs/backend/CANN.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

**Llama.cpp + CANN**

The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
The llama.cpp CANN backend is designed to support Ascend NPU. It utilizes the capabilities of AscendC and ACLNN, which are integrated into the CANN Toolkit and kernels, to use Ascend NPU directly.

## News

Expand Down Expand Up @@ -210,7 +210,7 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager
# and install driver.
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
```
If the following messaage appers, firmware is installed successfully.
If the following message appears, firmware is installed successfully.
```sh
Firmware package installed successfully!
```
Expand Down
2 changes: 1 addition & 1 deletion docs/backend/SYCL.md
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

- Remove **build** folder or try a clean-build.

- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
- I can **not** see `[ext_oneapi_level_zero:gpu]` after installing the GPU driver on Linux.

Please double-check with `sudo sycl-ls`.

Expand Down
2 changes: 1 addition & 1 deletion docs/backend/snapdragon/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920
### Windows

All artifacts are already installed in the `pkg-snapdragon` folder.
To run, adapt below instructions to use Powershell scrits in `scripts/snapdragon/windows`.
To run, adapt below instructions to use Powershell scripts in `scripts/snapdragon/windows`.

## How to Run

Expand Down
2 changes: 1 addition & 1 deletion docs/backend/snapdragon/windows.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ Once the build is complete HTP ops libraries will be installed like this
-a---- 1/22/2026 6:01 PM 4139 libggml-htp.cat
```

The .cat file, the signature and proper certicate installation can be verified with
The .cat file, the signature and proper certificate installation can be verified with

```
> signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat
Expand Down
4 changes: 2 additions & 2 deletions docs/build.md
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ You can verify that KleidiAI is being used by running
```bash
./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
```
If KleidiAI is enabled, the ouput will contain a line similar to:
If KleidiAI is enabled, the output will contain a line similar to:
```
load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
```
Expand Down Expand Up @@ -699,7 +699,7 @@ To read documentation for how to build on Android, [click here](./android.md)

## WebGPU [In Progress]

The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The currrent implementation is up-to-date with Dawn commit `bed1a61`.
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.

In the llama.cpp directory, build with CMake:

Expand Down
4 changes: 2 additions & 2 deletions docs/multimodal/MobileVLM.md
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ llama_print_timings: total time = 5990.25 ms / 202 tokens
Just the same as above.
**ouput**
**output**
```sh
encode_image_with_clip: image embedding created: 144 tokens

Expand All @@ -305,7 +305,7 @@ llama_print_timings: total time = 15513.95 ms / 412 tokens
## Run on Intel(R) Core(TM) Ultra7 115H
### operation system
Windows11
### comiple
### compile
```sh
make -j32
```
Expand Down
2 changes: 1 addition & 1 deletion examples/debug/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

This is a utility intended to help debug a model by registering a callback that
logs GGML operations and tensor data. It can also store the generated logits or
embeddings as well as the prompt and token ids for comparision with the original
embeddings as well as the prompt and token ids for comparison with the original
model.

### Usage
Expand Down
4 changes: 2 additions & 2 deletions examples/diffusion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ Choose one of the following scheduling methods:
- `-b`: Batch size

### Examples
#### Dream architechture:
#### Dream architecture:
```
llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
```

#### LLaDA architechture:
#### LLaDA architecture:
```
llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
```
Expand Down
4 changes: 2 additions & 2 deletions examples/llama.vim
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ highlight llama_hl_info guifg=#77ff2f ctermfg=119
" n_prefix: number of lines before the cursor location to include in the local prefix
" n_suffix: number of lines after the cursor location to include in the local suffix
" n_predict: max number of tokens to predict
" t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported)
" t_max_predict_ms: max alloted time for the prediction
" t_max_prompt_ms: max allotted time for the prompt processing (TODO: not yet supported)
" t_max_predict_ms: max allotted time for the prediction
" show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
" auto_fim: trigger FIM completion automatically on cursor movement
" max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
Expand Down
10 changes: 5 additions & 5 deletions examples/model-conversion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ Command line arguments take precedence over environment variables when both are

In cases where the transformer implementation for the model has not been released
yet it is possible to set the environment variable `UNRELEASED_MODEL_NAME` which
will then cause the transformer implementation to be loaded explicitely and not
will then cause the transformer implementation to be loaded explicitly and not
use AutoModelForCausalLM:
```
export UNRELEASED_MODEL_NAME=SomeNewModel
Expand Down Expand Up @@ -120,7 +120,7 @@ The converted model can be inspected using the following command:
(venv) $ make causal-run-converted-model
```

### Model logits verfication
### Model logits verification
The following target will run the original model and the converted model and
compare the logits:
```console
Expand Down Expand Up @@ -235,7 +235,7 @@ new model the model can be converted to GGUF format using the following command:
(venv) $ make embedding-run-converted-model
```

### Model logits verfication
### Model logits verification
The following target will run the original model and the converted model (which
was done manually in the previous steps) and compare the logits:
```console
Expand Down Expand Up @@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO

## HuggingFace utilities
The following targets are useful for creating collections and model repositories
on Hugging Face in the the ggml-org. These can be used when preparing a relase
on Hugging Face in the ggml-org. These can be used when preparing a release
to script the process for new model releases.

For the following targets a `HF_TOKEN` environment variable is required.
Expand All @@ -347,7 +347,7 @@ For the following targets a `HF_TOKEN` environment variable is required.
> $ unset HF_TOKEN

### Create a new Hugging Face Model (model repository)
This will create a new model repsository on Hugging Face with the specified
This will create a new model repository on Hugging Face with the specified
model name.
```console
(venv) $ make hf-create-model MODEL_NAME='TestModel' NAMESPACE="danbev" ORIGINAL_BASE_MODEL="some-base-model"
Expand Down
4 changes: 2 additions & 2 deletions examples/sycl/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU.

|Tool Name| Function|Status|
|-|-|-|
|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support|

### llama-ls-sycl-device

List all SYCL devices with ID, compute capability, max work group size, ect.
List all SYCL devices with ID, compute capability, max work group size, etc.

1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.

Expand Down
2 changes: 1 addition & 1 deletion ggml/include/ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ extern "C" {
Example usage:
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
// preferrably to run on the same backend as the buffer
// preferably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
Expand Down
2 changes: 1 addition & 1 deletion ggml/include/ggml-opt.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ extern "C" {
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);

// set gradients to zero, initilize loss, and optionally reset the optimizer
// set gradients to zero, initialize loss, and optionally reset the optimizer
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);

GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically
Expand Down
2 changes: 1 addition & 1 deletion ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -2575,7 +2575,7 @@ extern "C" {
struct ggml_tensor * grad,
struct ggml_tensor * sgd_params); // alpha, weight decay

// build forward mutiple tensors and select one of them for computing
// build forward multiple tensors and select one of them for computing
// this is useful for creating graphs that have constant topology but compute different things based on the input
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
//
Expand Down
6 changes: 3 additions & 3 deletions ggml/src/ggml-cpu/amx/mmq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ struct tile_config_t{
// will be needed.
//
// Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16;
// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`.
// and the single batch gemm (m=1) has a special fast path with `avx512-vnni`.
//
// ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/
// advanced-matrix-extensions-intrinsics-functions.html
Expand Down Expand Up @@ -1379,8 +1379,8 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
// sum of offsets, shared across COLS
//
// avx512-vnni does not have `_mm512_dpbssd_epi32`,
// need to transfrom ss to us:
// a * (b - 8) is equavilent to b * a - 8 * a
// need to transform ss to us:
// a * (b - 8) is equivalent to b * a - 8 * a
// s u u u s u s
//
__m512i vcomp;
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-cpu/arch/arm/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -968,7 +968,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

const int vector_length = ggml_cpu_get_sve_cnt()*8;

//VLA Implemenation for SVE
//VLA Implementation for SVE
switch (vector_length) {
case 128:
{
Expand Down
4 changes: 2 additions & 2 deletions ggml/src/ggml-cpu/arch/arm/repack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -781,7 +781,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,

const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K;

// Load the 64 quants from q8K duplicated to use vecdots with the interelaved columns
// Load the 64 quants from q8K duplicated to use vecdots with the interleaved columns
// but still need the qs to use the low and hi bits from q4
const int8_t * q8_base = q8_ptr[b].qs + sb * 64;
int8x16_t q8_qs[8];
Expand Down Expand Up @@ -3796,7 +3796,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n,

for (int b = 0; b < nb; b++) {
// bsums pairs belongs to the same q8_k subblock
// 64 elemnts loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
// 64 elements loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
const int16x8_t bsums[4]{
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
Expand Down
Loading
Loading