diff --git a/build-xcframework.sh b/build-xcframework.sh index e8af16211fd..d45af083f5e 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -43,11 +43,6 @@ COMMON_CMAKE_ARGS=( -DGGML_OPENMP=${GGML_OPENMP} ) -XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }') -MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1) -MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2) -echo "Detected Xcode version: $XCODE_VERSION" - check_required_tool() { local tool=$1 local install_message=$2 @@ -60,9 +55,12 @@ check_required_tool() { } echo "Checking for required tools..." check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)" -check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" -check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)" -check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" +check_required_tool "xcrun" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" + +XCODE_VERSION=$(xcrun xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }') +MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1) +MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2) +echo "Detected Xcode version: $XCODE_VERSION" set -e @@ -260,7 +258,7 @@ combine_static_libraries() { # Since we have multiple architectures libtool will find object files that do not # match the target architecture. We suppress these warnings. - libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null + xcrun libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null # Determine SDK, architectures, and install_name based on platform and simulator flag. local sdk="" @@ -333,7 +331,7 @@ combine_static_libraries() { # Platform-specific post-processing for device builds if [[ "$is_simulator" == "false" ]]; then - if command -v xcrun vtool &>/dev/null; then + if xcrun -f vtool &>/dev/null; then case "$platform" in "ios") echo "Marking binary as a framework binary for iOS..." @@ -528,7 +526,7 @@ combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false" # Create XCFramework with correct debug symbols paths echo "Creating XCFramework..." -xcodebuild -create-xcframework \ +xcrun xcodebuild -create-xcframework \ -framework $(pwd)/build-ios-sim/framework/llama.framework \ -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \ -framework $(pwd)/build-ios-device/framework/llama.framework \ diff --git a/common/common.cpp b/common/common.cpp index ec15804c91a..26edcc383f1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1223,7 +1223,7 @@ common_init_result_ptr common_init_from_params(common_params & params) { return res; } - int err = llama_apply_adapter_cvec( + int err = llama_set_adapter_cvec( lctx, cvec.data.data(), cvec.data.size(), @@ -1325,12 +1325,15 @@ std::string get_model_endpoint() { } void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora) { - llama_clear_adapter_lora(ctx); - for (auto & la : lora) { - if (la.scale != 0.0f) { - llama_set_adapter_lora(ctx, la.ptr, la.scale); - } + std::vector loras; + std::vector scales; + + for (auto & la: lora) { + loras.push_back(la.ptr); + scales.push_back(la.scale); } + + llama_set_adapters_lora(ctx, loras.data(), loras.size(), scales.data()); } struct llama_model_params common_model_params_to_llama(common_params & params) { diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 825080b588e..da0efdd7937 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -570,6 +570,7 @@ def prepare_tensors(self): self.match_model_tensor_name(new_name, key, bid) for key in ( gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.FFN_GATE_INP_SHEXP, gguf.MODEL_TENSOR.POS_EMBD, gguf.MODEL_TENSOR.TOKEN_TYPES, gguf.MODEL_TENSOR.SSM_CONV1D, diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index b4ca9c5dd6f..3db7f126291 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1067,8 +1067,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_MUL: case GGML_OP_DIV: case GGML_OP_ADD_ID: - return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]) && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_ACC: + return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]) && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_REPEAT: case GGML_OP_CONV_TRANSPOSE_1D: return true; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index c04e9fc7ffa..3d5db0b79f5 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -620,8 +620,8 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) { GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); GGML_ASSERT(op->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_is_contiguous(op->src[0])); - GGML_ASSERT(ggml_is_contiguous(op->src[1])); + GGML_ASSERT(ggml_is_contiguous_rows(op->src[0])); + GGML_ASSERT(ggml_is_contiguous_rows(op->src[1])); const size_t pnb1 = ((const int32_t *) op->op_params)[0]; const size_t pnb2 = ((const int32_t *) op->op_params)[1]; @@ -671,10 +671,10 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) { } ggml_metal_kargs_bin args = { - /*.ne00 =*/ ne00, - /*.ne01 =*/ ne01, - /*.ne02 =*/ ne02, - /*.ne03 =*/ ne03, + /*.ne00 =*/ ne10, + /*.ne01 =*/ ne11, + /*.ne02 =*/ ne12, + /*.ne03 =*/ ne13, /*.nb00 =*/ nb00, /*.nb01 =*/ pnb1, /*.nb02 =*/ pnb2, @@ -687,10 +687,10 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) { /*.nb11 =*/ nb11, /*.nb12 =*/ nb12, /*.nb13 =*/ nb13, - /*.ne0 =*/ ne0, - /*.ne1 =*/ ne1, - /*.ne2 =*/ ne2, - /*.ne3 =*/ ne3, + /*.ne0 =*/ ne10, + /*.ne1 =*/ ne11, + /*.ne2 =*/ ne12, + /*.ne3 =*/ ne13, /*.nb0 =*/ nb0, /*.nb1 =*/ pnb1, /*.nb2 =*/ pnb2, @@ -707,7 +707,13 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); - const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne00); + const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + + int nth = 1; + + while (2*nth < args.ne0 && nth < nth_max) { + nth *= 2; + } ggml_metal_encoder_dispatch_threadgroups(enc, ne11, ne12, ne13, nth, 1, 1); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 82933ae0330..114992da08d 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -92,6 +92,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } #define VK_VENDOR_ID_APPLE 0x106b #define VK_VENDOR_ID_INTEL 0x8086 #define VK_VENDOR_ID_NVIDIA 0x10de +#define VK_VENDOR_ID_QUALCOMM 0x5143 #define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256 @@ -687,6 +688,7 @@ struct vk_device_struct { vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT]; vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT]; vk_pipeline pipeline_acc_f32; + vk_pipeline pipeline_set_f32; // [src0 0=fp32,1=fp16][src1 0=fp32,1=fp16][dst 0=fp32,1=fp16] vk_pipeline pipeline_add[2][2][2]; @@ -4080,7 +4082,7 @@ static void ggml_vk_load_shaders(vk_device& device) { } ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -4181,7 +4183,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_add_id_f32, "add_id_f32", add_id_f32_len, add_id_f32_data, "main", 4, sizeof(vk_op_add_id_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_set_f32, "set_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 0}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); @@ -5641,6 +5644,10 @@ static void ggml_vk_instance_init() { driver_priorities[vk::DriverId::eMesaNvk] = 2; #endif break; + case VK_VENDOR_ID_QUALCOMM: + driver_priorities[vk::DriverId::eQualcommProprietary] = 1; + driver_priorities[vk::DriverId::eMesaTurnip] = 2; + break; } driver_priorities[vk::DriverId::eMesaDozen] = 100; @@ -8817,6 +8824,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_acc_f32; } return nullptr; + case GGML_OP_SET: + if (src0->type == src1->type && src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32)) { + return ctx->device->pipeline_set_f32; + } + return nullptr; case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_MUL: @@ -9808,7 +9821,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const int nb3 = dst->op_params[2] / src0_type_size; // 4 bytes of float32 int offset = dst->op_params[3] / src0_type_size; // offset in bytes - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ACC, { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, @@ -10626,8 +10639,10 @@ static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& sub } static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f }); + const float * op_params = (const float *)dst->op_params; + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = op_params[0]; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, std::move(p)); } static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { @@ -12502,6 +12517,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr break; case GGML_OP_ACC: + case GGML_OP_SET: ggml_vk_acc(ctx, compute_ctx, src0, src1, node); break; @@ -14898,8 +14914,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return true; case GGML_OP_NORM: case GGML_OP_GROUP_NORM: - case GGML_OP_L2_NORM: return ggml_is_contiguous(op->src[0]); + case GGML_OP_L2_NORM: + return ggml_is_contiguous_rows(op->src[0]) && + op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_MUL: @@ -14962,7 +14980,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm } return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_ACC: - return op->src[0]->type == GGML_TYPE_F32; + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; + case GGML_OP_SET: + return op->src[0]->type == op->src[1]->type && op->src[0]->type == op->type && + (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_I32); case GGML_OP_CONCAT: return ggml_type_size(op->src[0]->type) == ggml_type_size(GGML_TYPE_F32); case GGML_OP_ADD1: @@ -15613,6 +15634,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]); } else if (tensor->op == GGML_OP_ACC) { tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]); + } else if (tensor->op == GGML_OP_SET) { + tensor_clone = ggml_set(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]); } else if (tensor->op == GGML_OP_NORM) { tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_GROUP_NORM) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp index 3d61168b56f..6ba3d1d89e0 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp @@ -3,6 +3,9 @@ #include "types.glsl" #include "generic_binary_head.glsl" +// false for SET, true for ACC +layout(constant_id = 1) const bool ACC = true; + layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; void main() { @@ -23,7 +26,11 @@ void main() { uint i00, i01, i02, i03; if (i0 < p.ne10 && i1 < p.ne11 && i2 < p.ne12 && i3 < p.ne13) { - data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i0, i1, i2, i3)])); + if (ACC) { + data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i0, i1, i2, i3)])); + } else { + data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_b[get_boffset() + src1_idx(i0, i1, i2, i3)])); + } } else { data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx])); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp index 83ef2f87958..7d0a1de0df9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp @@ -1,6 +1,6 @@ #version 450 -#include "generic_head.glsl" +#include "generic_unary_head.glsl" #include "types.glsl" #extension GL_EXT_control_flow_attributes : enable @@ -8,19 +8,22 @@ layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - shared FLOAT_TYPE sum[BLOCK_SIZE]; void main() { const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; + const uint i3 = row / (p.ne11 * p.ne12); + const uint i3_offset = i3 * p.ne12 * p.ne11; + const uint i2 = (row - i3_offset) / p.ne11; + const uint i2_offset = i2 * p.ne11; + const uint i1 = row - i3_offset - i2_offset; + sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp - [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { - const FLOAT_TYPE xi = FLOAT_TYPE(data_a[row*p.KX + col]); + [[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) { + const FLOAT_TYPE xi = FLOAT_TYPE(data_a[i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0]); sum[tid] += xi * xi; } @@ -35,7 +38,7 @@ void main() { const FLOAT_TYPE scale = inversesqrt(max(sum[0], FLOAT_TYPE(p.param1))); - [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { - data_d[row*p.KX + col] = D_TYPE(scale * FLOAT_TYPE(data_a[row*p.KX + col])); + [[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) { + data_d[i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0] = D_TYPE(scale * FLOAT_TYPE(data_a[i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0])); } } diff --git a/include/llama.h b/include/llama.h index 305623127cb..d2d7f59ebc6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -656,21 +656,12 @@ extern "C" { // The following functions operate on a llama_context, hence the naming: llama_verb_... - // Add a loaded LoRA adapter to given context - // This will not modify model's weight - LLAMA_API int32_t llama_set_adapter_lora( + // Set LoRa adapters on the context. Will only modify if the adapters currently in context are different. + LLAMA_API int32_t llama_set_adapters_lora( struct llama_context * ctx, - struct llama_adapter_lora * adapter, - float scale); - - // Remove a specific LoRA adapter from given context - // Return -1 if the adapter is not present in the context - LLAMA_API int32_t llama_rm_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter); - - // Remove all LoRA adapters from given context - LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx); + struct llama_adapter_lora ** adapters, + size_t n_adapters, + float * scales); // Apply a loaded control vector to a llama_context, or if data is NULL, clear // the currently loaded vector. @@ -678,7 +669,7 @@ extern "C" { // to an n_embd x n_layers buffer starting from layer 1. // il_start and il_end are the layer range the vector should apply to (both inclusive) // See llama_control_vector_load in common to load a control vector. - LLAMA_API int32_t llama_apply_adapter_cvec( + LLAMA_API int32_t llama_set_adapter_cvec( struct llama_context * ctx, const float * data, size_t len, diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py index 68db04dea9b..d1011f2b5ec 100755 --- a/scripts/sync_vendor.py +++ b/scripts/sync_vendor.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 import urllib.request +import os +import sys +import subprocess HTTPLIB_VERSION = "f80864ca031932351abef49b74097c67f14719c6" @@ -14,7 +17,8 @@ # "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.23/miniaudio.h": "vendor/miniaudio/miniaudio.h", "https://github.com/mackron/miniaudio/raw/669ed3e844524fcd883231b13095baee9f6de304/miniaudio.h": "vendor/miniaudio/miniaudio.h", - f"https://raw.githubusercontent.com/yhirose/cpp-httplib/{HTTPLIB_VERSION}/httplib.h": "vendor/cpp-httplib/httplib.h", + f"https://raw.githubusercontent.com/yhirose/cpp-httplib/{HTTPLIB_VERSION}/httplib.h": "httplib.h", + f"https://raw.githubusercontent.com/yhirose/cpp-httplib/{HTTPLIB_VERSION}/split.py": "split.py", f"https://raw.githubusercontent.com/yhirose/cpp-httplib/{HTTPLIB_VERSION}/LICENSE": "vendor/cpp-httplib/LICENSE", "https://raw.githubusercontent.com/sheredom/subprocess.h/b49c56e9fe214488493021017bf3954b91c7c1f5/subprocess.h": "vendor/sheredom/subprocess.h", @@ -24,19 +28,16 @@ print(f"downloading {url} to {filename}") # noqa: NP100 urllib.request.urlretrieve(url, filename) - # split cpp/h files for httplib - # see: https://github.com/yhirose/cpp-httplib/blob/master/split.py - if 'httplib.h' in filename: - border = '// ----------------------------------------------------------------------------' - with open(filename, 'r') as f: - content = f.read() - header, implementation, footer = content.split(border, 2) - fname_cpp = filename.replace('.h', '.cpp') - with open(filename, 'w') as fh: - fh.write(header) - fh.write(footer) - with open(fname_cpp, 'w') as fc: - fc.write('#include "httplib.h"\n') - fc.write('namespace httplib {\n') - fc.write(implementation.replace('\ninline ', '\n')) - fc.write('} // namespace httplib\n') +print("Splitting httplib.h...") # noqa: NP100 +try: + subprocess.check_call([ + sys.executable, "split.py", + "--extension", "cpp", + "--out", "vendor/cpp-httplib" + ]) +except Exception as e: + print(f"Error: {e}") # noqa: NP100 + sys.exit(1) +finally: + os.remove("split.py") + os.remove("httplib.h") diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 6b43ca19267..ac17e1a0fe2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1057,51 +1057,43 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) { return true; } -void llama_context::set_adapter_lora( - llama_adapter_lora * adapter, - float scale) { - LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale); +void llama_context::set_adapters_lora(llama_adapter_lora ** adapters, size_t n_adapters, float * scales) { + LLAMA_LOG_DEBUG("%s: adapters = %p\n", __func__, (void *) adapters); - if (auto it = loras.find(adapter); it != loras.end()) { - if (it->second == scale) { - return; - } + if (adapters_lora_are_same(adapters, n_adapters, scales)) { + return; } - loras[adapter] = scale; + loras.clear(); + + for (size_t i = 0; i < n_adapters; i ++) { + if (scales[i] != 0.0f) { + loras[adapters[i]] = scales[i]; + } + } sched_need_reserve = true; } -bool llama_context::rm_adapter_lora( - llama_adapter_lora * adapter) { - LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter); - - auto it = loras.find(adapter); - if (it != loras.end()) { - loras.erase(it); +bool llama_context::adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales) { + LLAMA_LOG_DEBUG("%s: adapters = %p\n", __func__, (void *) adapters); - sched_need_reserve = true; - - return true; + if (n_adapters != loras.size()) { + return false; } - return false; -} - -void llama_context::clear_adapter_lora() { - LLAMA_LOG_DEBUG("%s: call\n", __func__); + for (size_t i = 0; i < n_adapters; i ++) { + auto it = loras.find(adapters[i]); - if (loras.empty()) { - return; + if (it == loras.end() || it->second != scales[i]) { + return false; + } } - loras.clear(); - - sched_need_reserve = true; + return true; } -bool llama_context::apply_adapter_cvec( +bool llama_context::set_adapter_cvec( const float * data, size_t len, int32_t n_embd, @@ -3209,35 +3201,28 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) { // llama adapter API -int32_t llama_set_adapter_lora( +int32_t llama_set_adapters_lora( llama_context * ctx, - llama_adapter_lora * adapter, - float scale) { - ctx->set_adapter_lora(adapter, scale); - - return 0; -} + llama_adapter_lora ** adapters, + size_t n_adapters, + float * scales) { + if (adapters == nullptr || scales == nullptr) { + GGML_ASSERT(n_adapters == 0 && "invalid llama_set_adapters_lora call"); + } -int32_t llama_rm_adapter_lora( - llama_context * ctx, - llama_adapter_lora * adapter) { - bool res = ctx->rm_adapter_lora(adapter); + ctx->set_adapters_lora(adapters, n_adapters, scales); - return res ? 0 : -1; -} - -void llama_clear_adapter_lora(llama_context * ctx) { - ctx->clear_adapter_lora(); + return 0; } -int32_t llama_apply_adapter_cvec( +int32_t llama_set_adapter_cvec( llama_context * ctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end) { - bool res = ctx->apply_adapter_cvec(data, len, n_embd, il_start, il_end); + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + bool res = ctx->set_adapter_cvec(data, len, n_embd, il_start, il_end); return res ? 0 : -1; } diff --git a/src/llama-context.h b/src/llama-context.h index d9951175744..37117ba7b67 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -105,16 +105,11 @@ struct llama_context { void set_causal_attn(bool value); void set_warmup(bool value); - void set_adapter_lora( - llama_adapter_lora * adapter, - float scale); + void set_adapters_lora(llama_adapter_lora ** adapters, size_t n_adapters, float * scales); - bool rm_adapter_lora( - llama_adapter_lora * adapter); + bool adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales); - void clear_adapter_lora(); - - bool apply_adapter_cvec( + bool set_adapter_cvec( const float * data, size_t len, int32_t n_embd, diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 0261e4c72c9..c03228e9ce2 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -504,6 +504,8 @@ struct llama_mmap::impl { } } #elif defined(_WIN32) + HANDLE hMapping = nullptr; + impl(struct llama_file * file, size_t prefetch, bool numa) { GGML_UNUSED(numa); @@ -511,7 +513,7 @@ struct llama_mmap::impl { HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id()); - HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); if (hMapping == NULL) { DWORD error = GetLastError(); @@ -520,9 +522,9 @@ struct llama_mmap::impl { addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); DWORD error = GetLastError(); - CloseHandle(hMapping); if (addr == NULL) { + CloseHandle(hMapping); throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); } @@ -554,9 +556,17 @@ struct llama_mmap::impl { } ~impl() { - if (!UnmapViewOfFile(addr)) { - LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); + if (hMapping) { + if (addr) { + if (!UnmapViewOfFile(addr)) { + LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } + if (!CloseHandle(hMapping)) { + LLAMA_LOG_WARN("warning: CloseHandle failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } } } #else diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index a50c569b820..d818ed67d65 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -5821,20 +5821,27 @@ struct test_l2_norm : public test_case { const ggml_type type; const std::array ne; const float eps; + bool v; std::string vars() override { - return VARS_TO_STR2(type, ne); + return VARS_TO_STR4(type, ne, eps, v); } test_l2_norm(ggml_type type = GGML_TYPE_F32, std::array ne = {64, 64, 320, 1}, - float eps = 1e-12f) - : type(type), ne(ne), eps(eps) {} + float eps = 1e-12f, + bool v = false) + : type(type), ne(ne), eps(eps), v(v) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_set_name(a, "a"); + if (v) { + a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view of a"); + } + ggml_tensor * out = ggml_l2_norm(ctx, a, eps); ggml_set_name(out, "out"); @@ -7596,7 +7603,8 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps)); } test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, { n, 5, 4, 3 }, eps)); - test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps)); + test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false)); + test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true)); } } diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index f4ff57b4c97..75fc856f545 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/webui/.storybook/main.ts b/tools/server/webui/.storybook/main.ts index bfd16fa2245..4f6945f2108 100644 --- a/tools/server/webui/.storybook/main.ts +++ b/tools/server/webui/.storybook/main.ts @@ -1,17 +1,24 @@ import type { StorybookConfig } from '@storybook/sveltekit'; +import { dirname, resolve } from 'path'; +import { fileURLToPath } from 'url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); const config: StorybookConfig = { stories: ['../tests/stories/**/*.mdx', '../tests/stories/**/*.stories.@(js|ts|svelte)'], addons: [ '@storybook/addon-svelte-csf', '@chromatic-com/storybook', - '@storybook/addon-docs', + '@storybook/addon-vitest', '@storybook/addon-a11y', - '@storybook/addon-vitest' + '@storybook/addon-docs' ], - framework: { - name: '@storybook/sveltekit', - options: {} + framework: '@storybook/sveltekit', + viteFinal: async (config) => { + config.server = config.server || {}; + config.server.fs = config.server.fs || {}; + config.server.fs.allow = [...(config.server.fs.allow || []), resolve(__dirname, '../tests')]; + return config; } }; export default config; diff --git a/tools/server/webui/.storybook/preview.ts b/tools/server/webui/.storybook/preview.ts index 8d530e43e37..566dbfd289c 100644 --- a/tools/server/webui/.storybook/preview.ts +++ b/tools/server/webui/.storybook/preview.ts @@ -13,7 +13,7 @@ const preview: Preview = { }, backgrounds: { - disable: true + disabled: true }, a11y: { diff --git a/tools/server/webui/docs/flows/settings-flow.md b/tools/server/webui/docs/flows/settings-flow.md index 474aef01b09..40ad3bd94d7 100644 --- a/tools/server/webui/docs/flows/settings-flow.md +++ b/tools/server/webui/docs/flows/settings-flow.md @@ -49,14 +49,20 @@ sequenceDiagram settingsStore->>serverStore: defaultParams serverStore-->>settingsStore: {temperature, top_p, top_k, ...} - settingsStore->>ParamSvc: extractServerDefaults(defaultParams) - ParamSvc-->>settingsStore: Record + loop each SYNCABLE_PARAMETER + alt key NOT in userOverrides + settingsStore->>settingsStore: config[key] = serverDefault[key] + Note right of settingsStore: Non-overridden params adopt server default + else key in userOverrides + Note right of settingsStore: Keep user value, skip server default + end + end - settingsStore->>ParamSvc: mergeWithServerDefaults(config, serverDefaults) - Note right of ParamSvc: For each syncable parameter:
- If NOT in userOverrides → use server default
- If in userOverrides → keep user value - ParamSvc-->>settingsStore: mergedConfig + alt serverStore.props has webuiSettings + settingsStore->>settingsStore: Apply webuiSettings from server + Note right of settingsStore: Server-provided UI settings
(e.g. showRawOutputSwitch) + end - settingsStore->>settingsStore: config = mergedConfig settingsStore->>settingsStore: saveConfig() deactivate settingsStore @@ -67,11 +73,18 @@ sequenceDiagram UI->>settingsStore: updateConfig(key, value) activate settingsStore settingsStore->>settingsStore: config[key] = value - settingsStore->>settingsStore: userOverrides.add(key) - Note right of settingsStore: Mark as user-modified (won't be overwritten by server) + + alt value matches server default for key + settingsStore->>settingsStore: userOverrides.delete(key) + Note right of settingsStore: Matches server default, remove override + else value differs from server default + settingsStore->>settingsStore: userOverrides.add(key) + Note right of settingsStore: Mark as user-modified (won't be overwritten) + end + settingsStore->>settingsStore: saveConfig() - settingsStore->>LS: set("llama-config", config) - settingsStore->>LS: set("llama-userOverrides", [...userOverrides]) + settingsStore->>LS: set(CONFIG_LOCALSTORAGE_KEY, config) + settingsStore->>LS: set(USER_OVERRIDES_LOCALSTORAGE_KEY, [...userOverrides]) deactivate settingsStore UI->>settingsStore: updateMultipleConfig({key1: val1, key2: val2}) @@ -88,10 +101,9 @@ sequenceDiagram UI->>settingsStore: resetConfig() activate settingsStore - settingsStore->>settingsStore: config = SETTING_CONFIG_DEFAULT + settingsStore->>settingsStore: config = {...SETTING_CONFIG_DEFAULT} settingsStore->>settingsStore: userOverrides.clear() - settingsStore->>settingsStore: syncWithServerDefaults() - Note right of settingsStore: Apply server defaults for syncable params + Note right of settingsStore: All params reset to defaults
Next syncWithServerDefaults will adopt server values settingsStore->>settingsStore: saveConfig() deactivate settingsStore diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIconsCodeBlock.svelte b/tools/server/webui/src/lib/components/app/actions/ActionIconsCodeBlock.svelte index 54ff0af1a07..b20e79b5e0b 100644 --- a/tools/server/webui/src/lib/components/app/actions/ActionIconsCodeBlock.svelte +++ b/tools/server/webui/src/lib/components/app/actions/ActionIconsCodeBlock.svelte @@ -1,6 +1,6 @@ + +
+ + + + + + + + +

{triggerTooltipText}

+
+
+
+ + + {#each actions as item (item.id)} + {@const hasDisabledTooltip = !!item.disabled && !!item.disabledReason} + {@const hasEnabledTooltip = !item.disabled && !!item.tooltip} + + {#if hasDisabledTooltip} + + + + {#if item.id === 'images'} + + {:else if item.id === 'audio'} + + {:else if item.id === 'text'} + + {:else if item.id === 'pdf'} + + {:else} + + {/if} + + {item.label} + + + + +

{item.disabledReason}

+
+
+ {:else if hasEnabledTooltip} + + + handleActionClick(item.id)}> + {#if item.id === 'images'} + + {:else if item.id === 'audio'} + + {:else if item.id === 'text'} + + {:else if item.id === 'pdf'} + + {:else} + + {/if} + + {item.label} + + + + +

{item.tooltip}

+
+
+ {:else} + handleActionClick(item.id)}> + {#if item.id === 'images'} + + {:else if item.id === 'audio'} + + {:else if item.id === 'text'} + + {:else if item.id === 'pdf'} + + {:else} + + {/if} + + {item.label} + + {/if} + {/each} +
+
+
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte index c621a69e050..cf5aca42a1f 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte @@ -2,7 +2,7 @@ import { Square } from '@lucide/svelte'; import { Button } from '$lib/components/ui/button'; import { - ChatFormActionFileAttachments, + ChatFormActionAttachmentsDropdown, ChatFormActionRecord, ChatFormActionSubmit, ModelsSelector @@ -157,7 +157,7 @@ const { handleModelChange } = useModelChangeValidation({ getRequiredModalities: () => usedModalities(), - onValidationFailure: async (previousModelId) => { + onValidationFailure: async (previousModelId: string | null) => { if (previousModelId) { await modelsStore.selectModelById(previousModelId); } @@ -166,32 +166,39 @@
- - - +
+ +
+ +
+ +
{#if isLoading} {:else if shouldShowRecordButton} diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte index 3470e2f711d..25895c83b7f 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte @@ -62,8 +62,8 @@ assistantMessages: number; messageTypes: string[]; } | null>(null); - let editedContent = $state(message.content); - let editedExtras = $state(message.extra ? [...message.extra] : []); + let editedContent = $derived(message.content); + let editedExtras = $derived(message.extra ? [...message.extra] : []); let editedUploadedFiles = $state([]); let isEditing = $state(false); let showDeleteDialog = $state(false); diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte index 1cb6b274b67..867def5fc3c 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte @@ -105,7 +105,7 @@ const { handleModelChange } = useModelChangeValidation({ getRequiredModalities: () => conversationsStore.getModalitiesUpToMessage(message.id), - onSuccess: (modelName) => onRegenerate(modelName) + onSuccess: (modelName: string) => onRegenerate(modelName) }); function handleCopyModel() { diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte index f812ea2fd9d..c216ea690b1 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte @@ -133,7 +133,7 @@ const { handleModelChange } = useModelChangeValidation({ getRequiredModalities, - onValidationFailure: async (previousModelId) => { + onValidationFailure: async (previousModelId: string | null) => { if (previousModelId) { await modelsStore.selectModelById(previousModelId); } diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte index d457e042fcb..b53e82aaf9c 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte @@ -28,7 +28,7 @@ initialView = ChatMessageStatsView.GENERATION }: Props = $props(); - let activeView: ChatMessageStatsView = $state(initialView); + let activeView: ChatMessageStatsView = $derived(initialView); let hasAutoSwitchedToGeneration = $state(false); // In live mode: auto-switch to GENERATION tab when prompt processing completes diff --git a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte index a5450e6af89..3d432e26bc7 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte @@ -35,6 +35,7 @@ import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte'; import { isFileTypeSupported, filterFilesByModalities } from '$lib/utils'; import { parseFilesToMessageExtras, processFilesToChatUploaded } from '$lib/utils/browser-only'; + import { ErrorDialogType } from '$lib/enums'; import { onMount } from 'svelte'; import { fade, fly, slide } from 'svelte/transition'; import { Trash2, AlertTriangle, RefreshCw } from '@lucide/svelte'; @@ -616,7 +617,7 @@ contextInfo={activeErrorDialog?.contextInfo} onOpenChange={handleErrorDialogOpenChange} open={Boolean(activeErrorDialog)} - type={activeErrorDialog?.type ?? 'server'} + type={(activeErrorDialog?.type as ErrorDialogType) ?? ErrorDialogType.SERVER} /> diff --git a/tools/server/webui/src/lib/components/app/misc/RemoveButton.svelte b/tools/server/webui/src/lib/components/app/misc/RemoveButton.svelte deleted file mode 100644 index 173685510ff..00000000000 --- a/tools/server/webui/src/lib/components/app/misc/RemoveButton.svelte +++ /dev/null @@ -1,26 +0,0 @@ - - - diff --git a/tools/server/webui/src/lib/components/app/misc/SyntaxHighlightedCode.svelte b/tools/server/webui/src/lib/components/app/misc/SyntaxHighlightedCode.svelte deleted file mode 100644 index bc42f9dd1e8..00000000000 --- a/tools/server/webui/src/lib/components/app/misc/SyntaxHighlightedCode.svelte +++ /dev/null @@ -1,97 +0,0 @@ - - -
- -
{@html highlightedHtml}
-
- - diff --git a/tools/server/webui/src/lib/components/app/models/ModelBadge.svelte b/tools/server/webui/src/lib/components/app/models/ModelBadge.svelte index bea1bf6e3f9..f98ba7d78d7 100644 --- a/tools/server/webui/src/lib/components/app/models/ModelBadge.svelte +++ b/tools/server/webui/src/lib/components/app/models/ModelBadge.svelte @@ -1,6 +1,6 @@