Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6415,6 +6415,17 @@ def set_gguf_parameters(self):
self.gguf_writer.add_head_count(0)


@ModelBase.register("MaincoderForCausalLM")
class MaincoderModel(TextModel):
    """GGUF conversion for Maincoder causal-LM checkpoints."""

    model_arch = gguf.MODEL_ARCH.MAINCODER

    def set_gguf_parameters(self):
        # Emit the common text-model parameters first.
        super().set_gguf_parameters()

        # When the checkpoint declares an explicit per-head dimension,
        # record it as the RoPE dimension count.
        head_dim = self.hparams.get("head_dim")
        if head_dim is not None:
            self.gguf_writer.add_rope_dimension_count(head_dim)


@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
class MambaModel(TextModel):
model_arch = gguf.MODEL_ARCH.MAMBA
Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-metal/ggml-metal-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2181,7 +2181,11 @@ size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {

const bool has_mask = op->src[3] != nullptr;

if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
// note: the non-vec kernel requires more extra memory, so always reserve for it
GGML_ASSERT(OP_FLASH_ATTN_EXT_NCPSG >= OP_FLASH_ATTN_EXT_VEC_NCPSG);

//if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
if (false) {
// note: always reserve the padding space to avoid graph reallocations
//const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_VEC_NCPSG != 0;
const bool has_kvpad = true;
Expand Down
17 changes: 15 additions & 2 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2702,7 +2702,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
switch (src0_type) {
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
lut_size = 2*2048;
lut_size = 2*2048 + 4*2048;
break;
case GGML_TYPE_IQ2_XXS:
lut_size = 8*256;
Expand Down Expand Up @@ -3627,6 +3627,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
uint32_t rm_kq = 2;
uint32_t rm_stdq_int = 1;
uint32_t rm_kq_int = 1;
auto const &rm_iq_int = [](uint32_t i) { return i == 0 ? 8u : 4u; };
if (device->vendor_id == VK_VENDOR_ID_AMD) {
if (device->architecture == AMD_GCN) {
rm_stdq = 2;
Expand Down Expand Up @@ -3730,6 +3731,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_q8_1_f32", arr_dmmv_q4_k_q8_1_f32_len[reduc], arr_dmmv_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_q8_1_f32", arr_dmmv_q5_k_q8_1_f32_len[reduc], arr_dmmv_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_q8_1_f32", arr_dmmv_q6_k_q8_1_f32_len[reduc], arr_dmmv_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int, i+1}, 1, true, use_subgroups, subgroup_size_int);

ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_S][i], "mul_mat_vec_iq1_s_q8_1_f32", arr_dmmv_iq1_s_q8_1_f32_len[reduc], arr_dmmv_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_IQ1_M][i], "mul_mat_vec_iq1_m_q8_1_f32", arr_dmmv_iq1_m_q8_1_f32_len[reduc], arr_dmmv_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(i), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(i), i+1}, 1, true, use_subgroups, subgroup_size_int);

}
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
}
Expand Down Expand Up @@ -3776,13 +3781,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_q8_1_f32", arr_dmmv_id_q4_k_q8_1_f32_len[reduc], arr_dmmv_id_q4_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_q8_1_f32", arr_dmmv_id_q5_k_q8_1_f32_len[reduc], arr_dmmv_id_q5_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_q8_1_f32", arr_dmmv_id_q6_k_q8_1_f32_len[reduc], arr_dmmv_id_q6_k_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_kq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_kq_int}, 1, true, use_subgroups, subgroup_size_int);

ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_S], "mul_mat_vec_id_iq1_s_q8_1_f32", arr_dmmv_id_iq1_s_q8_1_f32_len[reduc], arr_dmmv_id_iq1_s_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_IQ1_M], "mul_mat_vec_id_iq1_m_q8_1_f32", arr_dmmv_id_iq1_m_q8_1_f32_len[reduc], arr_dmmv_id_iq1_m_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_iq_int(0), 1, 1}, {wg_size_subgroup_int, 1*rm_iq_int(0)}, 1, true, use_subgroups, subgroup_size_int);
}
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
}

#if !defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
GGML_UNUSED(rm_stdq_int);
GGML_UNUSED(rm_kq_int);
GGML_UNUSED(rm_iq_int);
#endif

// dequant shaders
Expand Down Expand Up @@ -5616,6 +5625,8 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
break;
default:
return nullptr;
Expand Down Expand Up @@ -5772,6 +5783,8 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
break;
default:
return nullptr;
Expand Down Expand Up @@ -7037,7 +7050,7 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
// Quantization overhead is not worth it for small k
switch (device->vendor_id) {
case VK_VENDOR_ID_NVIDIA:
if (src0_type == GGML_TYPE_Q2_K) {
if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) {
return true;
}

Expand Down
11 changes: 11 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
#define K_PER_ITER 8
#elif defined(DATA_A_QUANT_K)
#define K_PER_ITER 16
#elif defined(DATA_A_IQ1_S) || defined(DATA_A_IQ1_M)
#define K_PER_ITER 32
#else
#error unimplemented
#endif
Expand Down Expand Up @@ -49,6 +51,15 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
cache_b_qs[1] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 4 + 1];
cache_b_qs[2] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 4 + 2];
cache_b_qs[3] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + b_qs_idx * 4 + 3];
#elif K_PER_ITER == 32
cache_b_qs[0] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 ];
cache_b_qs[1] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 1];
cache_b_qs[2] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 2];
cache_b_qs[3] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 3];
cache_b_qs[4] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 4];
cache_b_qs[5] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 5];
cache_b_qs[6] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 6];
cache_b_qs[7] = data_b[b_block_idx_outer].qs[b_block_idx_inner * 8 + 7];
#else
#error unimplemented
#endif
Expand Down
115 changes: 115 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -377,3 +377,118 @@ FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
return FLOAT_TYPE(float(cache_b_ds.x) * float(d_scale) * float(q_sum));
}
#endif

#if defined(DATA_A_IQ1_S)
// Unpack one 32-weight group of an IQ1_S block into eight packed-byte int32s,
// ready to feed dotPacked4x8EXT against the cached q8_1 activations.
// ib  - block index into data_a
// iqs - weight offset inside the block; only the 32-weight group (iqs / 32) is used
// out0/out1 - 2 x i32vec4: each component packs four 4-bit grid values, one per byte
void repack8(uint ib, uint iqs, out i32vec4 out0, out i32vec4 out1) {
    const uint ib32 = iqs / 32;

    // qh word for this group: low 12 bits hold four 3-bit high parts of the
    // grid indices; the upper bits carry the scale/delta info read by get_dm.
    const uint qh = data_a[ib].qh[ib32];

    // Four 8-bit low parts of the grid indices, fetched as two 16-bit words.
    const uint qs16_0 = data_a_packed16[ib].qs[(4 * ib32 + 0) / 2];
    const uint qs16_1 = data_a_packed16[ib].qs[(4 * ib32 + 2) / 2];

    const uint qs0 = qs16_0 & 0xFF;
    const uint qs1 = qs16_0 >> 8;
    const uint qs2 = qs16_1 & 0xFF;
    const uint qs3 = qs16_1 >> 8;

    // 3-bit high parts: bits [0..2], [3..5], [6..8], [9..11] of qh.
    const uint hi0 = bitfieldExtract(qh, 3 * int(0), 3);
    const uint hi1 = bitfieldExtract(qh, 3 * int(1), 3);
    const uint hi2 = bitfieldExtract(qh, 3 * int(2), 3);
    const uint hi3 = bitfieldExtract(qh, 3 * int(3), 3);

    // Each 11-bit index (8 low + 3 high bits) selects a precomputed grid row;
    // a row encodes 8 weights as biased 4-bit nibbles in a 32-bit word.
    const int32_t grid0 = int32_t(iq1s_grid_gpu[qs0 | (hi0 << 8)]);
    const int32_t grid1 = int32_t(iq1s_grid_gpu[qs1 | (hi1 << 8)]);
    const int32_t grid2 = int32_t(iq1s_grid_gpu[qs2 | (hi2 << 8)]);
    const int32_t grid3 = int32_t(iq1s_grid_gpu[qs3 | (hi3 << 8)]);

    // Split each grid row into its low/high nibble planes (4 bytes per int),
    // yielding 8 weights per row -> 32 weights across out0 and out1.
    out0 = i32vec4((grid0 >> 0) & 0x0F0F0F0F,
                   (grid0 >> 4) & 0x0F0F0F0F,
                   (grid1 >> 0) & 0x0F0F0F0F,
                   (grid1 >> 4) & 0x0F0F0F0F);
    out1 = i32vec4((grid2 >> 0) & 0x0F0F0F0F,
                   (grid2 >> 4) & 0x0F0F0F0F,
                   (grid3 >> 0) & 0x0F0F0F0F,
                   (grid3 >> 4) & 0x0F0F0F0F);
}

// Per-group dequantization factors for the 32-weight group containing iqs:
// .x = scale applied to the integer dot product, .y = additive offset term.
vec2 get_dm(uint ib, uint iqs) {
    const uint ib32 = iqs / 32;

    const uint qh = data_a[ib].qh[ib32];
    // Bit 15 of qh selects the sign of the per-group delta.
    const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;

    // Block scale times the 3-bit group scale (bits 12..14 of qh),
    // mapped onto the odd values 1, 3, ..., 15.
    const float d = float(data_a[ib].d);
    const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);

    // the -1 cancels out the bias in iq1s_grid_gpu
    return FLOAT_TYPE_VEC2(dl, dl * (delta - 1));
}

// Integer dot product of one 32-weight IQ1_S group against the cached q8_1
// activations in cache_b_qs, then dequantize with get_dm and cache_b_ds
// (cache_b_ds.x is the activation scale; .y presumably the q8_1 scaled
//  activation sum used for the offset term -- confirm against the b-cache fill).
FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
    int32_t q_sum = 0;

    // Map the 32-weight sub-block index onto the enclosing IQ1_S block:
    // 8 sub-blocks per 256-weight block.
    const uint ib_k = ib_a / 8;
    const uint iqs_k = (ib_a % 8) * 32 + iqs * 32;

    i32vec4 qs_a0;
    i32vec4 qs_a1;
    repack8(ib_k, iqs_k, qs_a0, qs_a1);

    const vec2 dm = get_dm(ib_k, iqs_k);

    // 8 packed dot products cover all 32 weight/activation pairs.
    q_sum += dotPacked4x8EXT(qs_a0.x, cache_b_qs[0]);
    q_sum += dotPacked4x8EXT(qs_a0.y, cache_b_qs[1]);
    q_sum += dotPacked4x8EXT(qs_a0.z, cache_b_qs[2]);
    q_sum += dotPacked4x8EXT(qs_a0.w, cache_b_qs[3]);
    q_sum += dotPacked4x8EXT(qs_a1.x, cache_b_qs[4]);
    q_sum += dotPacked4x8EXT(qs_a1.y, cache_b_qs[5]);
    q_sum += dotPacked4x8EXT(qs_a1.z, cache_b_qs[6]);
    q_sum += dotPacked4x8EXT(qs_a1.w, cache_b_qs[7]);

    // scale * dot product plus the per-group offset applied via the activation sum
    return FLOAT_TYPE(float(cache_b_ds.x) * float(dm.x) * float(q_sum) + float(dm.y) * float(cache_b_ds.y));
}
#endif

#if defined(DATA_A_IQ1_M)
// Integer dot product of one 32-weight IQ1_M group against the cached q8_1
// activations in cache_b_qs, dequantized on the fly.
// ib_a - index of the 32-weight sub-block (8 per 256-weight IQ1_M block)
// iqs  - additional sub-block offset folded into iqs_k
FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
    // Map the sub-block index onto the enclosing IQ1_M block.
    const uint ib_k = ib_a / 8;
    const uint iqs_k = (ib_a % 8) * 32 + iqs * 32;

    const uint ib32 = iqs_k / 32; // 32-weight group within the block
    const uint ib64 = ib32 / 2;   // 64-weight pair: one scales byte per pair

    // Block scale: an fp16 assembled from the top 4 bits of each of the four
    // 16-bit scales words.
    const uint16_t[4] scales = data_a[ib_k].scales;
    const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12;
    const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x);

    // One 32-bit word of four 8-bit grid-index low parts, and 16 bits holding
    // four 4-bit high-part/delta nibbles for this group.
    const uint qs32 = data_a_packed32[ib_k].qs[ib32];
    const uint qh16 = data_a_packed16[ib_k].qh[ib32];

    float sum = 0;
    const uint sc = data_a[ib_k].scales[ib64];
    // Process the group as four runs of 8 weights each.
    [[unroll]] for (int l = 0; l < 4; ++l) {
        const uint ib16 = 2 * ib32 + l / 2;
        // 3-bit sub-scale mapped onto the odd values 1..15, times the block scale.
        const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1);
        const uint qh = qh16 >> (4 * l);
        const uint qs = (qs32 >> (8 * l)) & 0xFF;
        // Bit 3 of the high nibble selects the sign of the per-run delta.
        const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;

        // 11-bit index (8 low + 3 high bits) into the shared IQ1 grid; a row
        // encodes 8 weights as biased 4-bit nibbles in a 32-bit word.
        const int32_t grid = int32_t(iq1s_grid_gpu[qs | ((qh & 7) << 8)]);

        // Dot of the two nibble planes with the 8 matching activations.
        int32_t q_sum = 0;
        q_sum += dotPacked4x8EXT((grid >> 0) & 0x0F0F0F0F, cache_b_qs[2 * l + 0]);
        q_sum += dotPacked4x8EXT((grid >> 4) & 0x0F0F0F0F, cache_b_qs[2 * l + 1]);

        // Plain sum of the 8 activations (dot against all-ones bytes),
        // needed to apply the additive delta term.
        int32_t y_sum = 0;
        y_sum += dotPacked4x8EXT(int(0x01010101), cache_b_qs[2 * l + 0]);
        y_sum += dotPacked4x8EXT(int(0x01010101), cache_b_qs[2 * l + 1]);

        // the -1 cancels out the bias in iq1s_grid_gpu
        sum += dl * (q_sum + y_sum * (delta - 1));
    }
    // Finally apply the activation scale (cache_b_ds.x).
    sum *= float(cache_b_ds.x);

    return sum;
}
#endif
Loading
Loading