Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7574,6 +7574,18 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
return false;
}

if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) {
// Intel Windows proprietary driver tuning
switch (src0_type) {
case GGML_TYPE_MXFP4:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
return false;
default:
return true;
}
}

switch (src0_type) {
// From tests on A770 Linux, may need more tuning
case GGML_TYPE_Q4_0:
Expand Down
9 changes: 8 additions & 1 deletion ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ struct ggml_webgpu_shader_lib_context {
size_t wg_mem_limit_bytes = 0;
bool inplace = false;
bool overlap = false;
bool src_overlap = false;
bool supports_subgroup_matrix = false;
uint32_t sg_mat_m = 0;
uint32_t sg_mat_n = 0;
Expand Down Expand Up @@ -179,9 +180,10 @@ struct ggml_webgpu_binary_pipeline_key {
int op;
bool inplace;
bool overlap;
bool src_overlap;

bool operator==(const ggml_webgpu_binary_pipeline_key & other) const {
return type == other.type && op == other.op && inplace == other.inplace && overlap == other.overlap;
return type == other.type && op == other.op && inplace == other.inplace && overlap == other.overlap && src_overlap == other.src_overlap;
}
};

Expand All @@ -192,6 +194,7 @@ struct ggml_webgpu_binary_pipeline_key_hash {
ggml_webgpu_hash_combine(seed, key.op);
ggml_webgpu_hash_combine(seed, key.inplace);
ggml_webgpu_hash_combine(seed, key.overlap);
ggml_webgpu_hash_combine(seed, key.src_overlap);
return seed;
}
};
Expand Down Expand Up @@ -1044,6 +1047,7 @@ class ggml_webgpu_shader_lib {
.op = context.dst->op,
.inplace = context.inplace,
.overlap = context.overlap,
.src_overlap = context.src_overlap,
};

auto it = binary_pipelines.find(key);
Expand Down Expand Up @@ -1076,6 +1080,9 @@ class ggml_webgpu_shader_lib {
} else if (key.overlap) {
defines.push_back("OVERLAP");
variant += "_overlap";
} else if (key.src_overlap) {
defines.push_back("SRC_OVERLAP");
variant += "_src_overlap";
}

defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
Expand Down
147 changes: 106 additions & 41 deletions ggml/src/ggml-webgpu/ggml-webgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,28 @@ struct webgpu_buf_pool {
// which can run on a different thread than the calling thread.
std::mutex mutex;
std::condition_variable cv;
size_t cur_pool_size;
size_t max_pool_size;
wgpu::Device device;
wgpu::BufferUsage host_buf_usage;
wgpu::BufferUsage dev_buf_usage;
size_t buf_size;
bool should_grow;

void init(wgpu::Device device,
int num_bufs,
size_t buf_size,
wgpu::BufferUsage dev_buf_usage,
wgpu::BufferUsage host_buf_usage) {
wgpu::BufferUsage host_buf_usage,
bool should_grow = false,
size_t max_pool_size = WEBGPU_NUM_PARAM_BUFS * 2) {
this->max_pool_size = max_pool_size;
this->cur_pool_size = num_bufs;
this->device = device;
this->host_buf_usage = host_buf_usage;
this->dev_buf_usage = dev_buf_usage;
this->buf_size = buf_size;
this->should_grow = should_grow;
for (int i = 0; i < num_bufs; i++) {
wgpu::Buffer host_buf;
wgpu::Buffer dev_buf;
Expand All @@ -150,6 +166,25 @@ struct webgpu_buf_pool {

webgpu_pool_bufs alloc_bufs() {
std::unique_lock<std::mutex> lock(mutex);
if (!free.empty()) {
webgpu_pool_bufs bufs = free.back();
free.pop_back();
return bufs;
}

// Try growing the pool if no free buffers
if (free.empty() && cur_pool_size < max_pool_size && should_grow) {
cur_pool_size++;
wgpu::Buffer host_buf;
wgpu::Buffer dev_buf;
ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_pool_buf");
ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_pool_buf");

if (!(host_buf && dev_buf)) {
GGML_ABORT("webgpu_buf_pool: failed to allocate buffers");
}
return webgpu_pool_bufs{ host_buf, dev_buf };
}
cv.wait(lock, [this] { return !free.empty(); });
webgpu_pool_bufs bufs = free.back();
free.pop_back();
Expand Down Expand Up @@ -243,6 +278,7 @@ struct webgpu_gpu_profile_buf_pool {
#endif

struct webgpu_command {
uint32_t num_kernels;
wgpu::CommandBuffer commands;
std::vector<webgpu_pool_bufs> params_bufs;
std::optional<webgpu_pool_bufs> set_rows_error_bufs;
Expand Down Expand Up @@ -280,7 +316,6 @@ struct webgpu_global_context_struct {

webgpu_buf_pool memset_buf_pool;
std::map<int, webgpu_pipeline> memset_pipelines; // variant or type index
std::atomic_uint inflight_threads = 0;

#ifdef GGML_WEBGPU_CPU_PROFILE
// Profiling: labeled CPU time in ms (total)
Expand Down Expand Up @@ -426,13 +461,9 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device,
static void ggml_backend_webgpu_wait(webgpu_global_context & ctx,
std::vector<webgpu_submission_futures> & futures,
bool block = true) {
// If we have too many in-flight submissions, wait on the oldest one first. If
// there are many threads, inflight_max may be 0, meaning that we must wait on
// all futures.
uint64_t timeout_ms = block ? UINT64_MAX : 0;
uint32_t inflight_threads = ctx->inflight_threads;
uint32_t inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
while (futures.size() >= inflight_max && futures.size() > 0) {
// If we have too many in-flight submissions, wait on the oldest one first.
uint64_t timeout_ms = block ? UINT64_MAX : 0;
while (futures.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD) {
ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX);
futures.erase(futures.begin());
}
Expand Down Expand Up @@ -651,6 +682,7 @@ static webgpu_command ggml_backend_webgpu_build_multi(
result.commands = commands;
result.params_bufs = params_bufs_list;
result.set_rows_error_bufs = set_rows_error_bufs;
result.num_kernels = pipelines.size();
#ifdef GGML_WEBGPU_GPU_PROFILE
result.timestamp_query_bufs = ts_bufs;
// TODO: handle multiple pipeline names
Expand Down Expand Up @@ -788,6 +820,7 @@ static bool ggml_webgpu_tensor_overlap(ggml_tensor * a, ggml_tensor * b) {
struct binary_overlap_flags {
bool inplace; // src0 == dst
bool overlap; // src1 == dst
bool src_overlap;
};

static binary_overlap_flags ggml_webgpu_detect_binary_overlap(ggml_tensor * src0,
Expand All @@ -796,6 +829,7 @@ static binary_overlap_flags ggml_webgpu_detect_binary_overlap(ggml_tensor * src0
binary_overlap_flags flags = {};
flags.inplace = ggml_webgpu_tensor_equal(src0, dst);
flags.overlap = ggml_webgpu_tensor_overlap(src1, dst);
flags.src_overlap = ggml_webgpu_tensor_overlap(src0, src1);

return flags;
}
Expand Down Expand Up @@ -1353,6 +1387,7 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,
.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup,
.inplace = flags.inplace,
.overlap = flags.overlap,
.src_overlap = flags.src_overlap,
};

webgpu_pipeline pipeline = ctx->shader_lib->get_binary_pipeline(shader_lib_ctx);
Expand All @@ -1361,11 +1396,28 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,

uint32_t ne = (uint32_t) ggml_nelements(dst);

size_t src0_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, src0);
size_t src1_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, src1);

uint32_t offset_merged_src0 = 0;
uint32_t offset_merged_src1 = 0;
if (flags.src_overlap) {
size_t min_off = std::min(src0_webgpu_tensor_align_offset, src1_webgpu_tensor_align_offset);
offset_merged_src0 = (uint32_t) ((src0_webgpu_tensor_align_offset - min_off) / ggml_type_size(src0->type));
offset_merged_src1 = (uint32_t) ((src1_webgpu_tensor_align_offset - min_off) / ggml_type_size(src0->type));
}

std::vector<uint32_t> params = {
ne,
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
offset_merged_src0,
offset_merged_src1,
(uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
(uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
Expand All @@ -1381,25 +1433,43 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,

std::vector<wgpu::BindGroupEntry> entries;

entries.push_back({
.binding = 0,
.buffer = ggml_webgpu_tensor_buf(src0),
.offset = ggml_webgpu_tensor_align_offset(ctx, src0),
.size = ggml_webgpu_tensor_binding_size(ctx, src0),
});

entries.push_back({
.binding = 1,
.buffer = ggml_webgpu_tensor_buf(src1),
.offset = ggml_webgpu_tensor_align_offset(ctx, src1),
.size = ggml_webgpu_tensor_binding_size(ctx, src1),
});

if (!flags.inplace && !flags.overlap) {
entries.push_back({ .binding = 2,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
if (flags.src_overlap) {
size_t merged_offset = std::min(src0_webgpu_tensor_align_offset, src1_webgpu_tensor_align_offset);
size_t merged_end = std::max(src0_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, src0),
src1_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, src1));
entries.push_back({
.binding = 0,
.buffer = ggml_webgpu_tensor_buf(src0),
.offset = merged_offset,
.size = merged_end - merged_offset,
});
entries.push_back({
.binding = 1,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst),
});
} else {
entries.push_back({
.binding = 0,
.buffer = ggml_webgpu_tensor_buf(src0),
.offset = src0_webgpu_tensor_align_offset,
.size = ggml_webgpu_tensor_binding_size(ctx, src0),
});
entries.push_back({
.binding = 1,
.buffer = ggml_webgpu_tensor_buf(src1),
.offset = src1_webgpu_tensor_align_offset,
.size = ggml_webgpu_tensor_binding_size(ctx, src1),
});
if (!flags.inplace && !flags.overlap) {
entries.push_back({
.binding = 2,
.buffer = ggml_webgpu_tensor_buf(dst),
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
.size = ggml_webgpu_tensor_binding_size(ctx, dst),
});
}
}

uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
Expand Down Expand Up @@ -2043,19 +2113,17 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str

WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute);

ctx->global_ctx->inflight_threads++;

std::vector<webgpu_command> commands;
std::vector<webgpu_submission_futures> futures;
uint32_t num_batched_kernels = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) {
commands.push_back(*cmd);
num_batched_kernels += cmd.value().num_kernels;
}
// compute the batch size based on the number of inflight threads
uint32_t inflight_threads = ctx->global_ctx->inflight_threads;
uint32_t batch_size = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
if (commands.size() >= batch_size) {

if (num_batched_kernels >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
num_batched_kernels = 0;
futures.push_back(ggml_backend_webgpu_submit(ctx->global_ctx, commands, ctx->param_buf_pool,
&ctx->set_rows_error_buf_pool));
// Process events and check for completed submissions
Expand All @@ -2071,7 +2139,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
}

ggml_backend_webgpu_wait(ctx->global_ctx, futures);
ctx->global_ctx->inflight_threads--;
WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx->global_ctx);
return GGML_STATUS_SUCCESS;
}
Expand Down Expand Up @@ -2689,7 +2756,7 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
webgpu_ctx->shader_lib = std::make_unique<ggml_webgpu_shader_lib>(dev_ctx->webgpu_global_ctx->device);
webgpu_ctx->param_buf_pool.init(webgpu_ctx->global_ctx->device, WEBGPU_NUM_PARAM_BUFS, WEBGPU_PARAMS_BUF_SIZE_BYTES,
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform,
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite);
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite, true);
webgpu_ctx->set_rows_error_buf_pool.init(webgpu_ctx->global_ctx->device, WEBGPU_NUM_SET_ROWS_ERROR_BUFS,
WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage,
Expand Down Expand Up @@ -2816,10 +2883,8 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
// TODO: support non-contiguous tensors, e.g. for MOE_EXPERT_REDUCE
// see https://github.com/ggml-org/llama.cpp/pull/16857
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type) &&
(src1->type == op->type) && ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
(src1->type == op->type);
break;
case GGML_OP_CPY:
case GGML_OP_CONT:
Expand Down
Loading
Loading