
Commit e1fa9dd

Merge pull request #3 from TabbyML/support-starcoder-mqa
feat: support starcoder mqa
2 parents: 5ca037b + 08f35c4

2 files changed: +5, -24 lines

convert-starcoder-hf-to-gguf.py

Lines changed: 1 addition & 20 deletions
@@ -109,7 +109,7 @@ def parse_args() -> argparse.Namespace:
 gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(hparams["n_head"])
-gguf_writer.add_head_count_kv(hparams["n_head"])
+gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 gguf_writer.add_file_type(ftype)
 
@@ -209,25 +209,6 @@ def parse_args() -> argparse.Namespace:
 
     data = data.squeeze().numpy()
 
-    # TODO: implement MQA directly, instead of duplicate into MHA.
-    if name.endswith(".attn.c_attn.weight") or name.endswith(".attn.c_attn.bias"):
-        print("Duplicate K,V heads to use MHA instead of MQA for", name)
-
-        embed_dim = hparams["n_embd"]
-        head_dim = embed_dim // hparams["n_head"]
-
-        # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
-        q, k, v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0)
-        # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
-        if len(k.shape) == 2:
-            k = np.tile(k, (hparams["n_head"], 1))
-            v = np.tile(v, (hparams["n_head"], 1))
-        elif len(k.shape) == 1:
-            k = np.tile(k, (hparams["n_head"]))
-            v = np.tile(v, (hparams["n_head"]))
-        # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
-        data = np.concatenate((q, k, v), axis=0)
-
     # map tensor names
     new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
     if new_name is None:
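With `head_count_kv` now written as 1, the converter keeps the checkpoint's native MQA layout: the fused `c_attn` tensor has `(n_head + 2) * head_dim` rows (n_head query heads plus one shared key head and one shared value head), so the duplication step deleted above is no longer needed. A minimal numpy sketch of that layout, with made-up sizes (not taken from the script):

```python
import numpy as np

# Hypothetical StarCoder-like sizes, for illustration only.
n_head, head_dim = 4, 8
n_embd = n_head * head_dim
hidden = n_embd

# Fused c_attn weight in MQA layout: n_head query heads + 1 K head + 1 V head.
data = np.random.randn((n_head + 2) * head_dim, hidden).astype(np.float32)

# The same split the removed code performed before tiling K and V.
q, k, v = np.split(data, (n_head * head_dim, (n_head + 1) * head_dim), axis=0)

assert q.shape == (n_head * head_dim, hidden)  # one slice per query head
assert k.shape == (head_dim, hidden)           # single shared key head
assert v.shape == (head_dim, hidden)           # single shared value head
# With head_count_kv = 1 in the GGUF metadata, k and v are stored as-is;
# no np.tile duplication into an MHA-shaped (n_head * head_dim, hidden) tensor.
```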

llama.cpp

Lines changed: 4 additions & 4 deletions
@@ -2265,8 +2265,8 @@ static void llm_load_tensors(
         layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
         layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
 
-        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
-        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {3*n_embd}, backend_split);
+        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
 
         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
         layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
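In llama.cpp, `n_embd_gqa` is `n_embd_head * n_head_kv`; since the converter now records `n_head_kv = 1` for StarCoder, the fused QKV weight shrinks from `{n_embd, 3*n_embd}` to `{n_embd, n_embd + 2*n_embd_gqa}`. A quick back-of-the-envelope check, assuming StarCoder-15B's hyperparameters (n_embd = 6144, n_head = 48; values assumed here, not stated in this diff):

```python
# Fused QKV width under MQA vs. the old duplicated-MHA layout.
n_embd, n_head, n_head_kv = 6144, 48, 1   # assumed StarCoder-15B values

n_embd_head = n_embd // n_head            # 128 floats per head
n_embd_gqa  = n_embd_head * n_head_kv     # 128 under MQA (one K/V head)

old_qkv_rows = 3 * n_embd                 # 18432: Q, K, V each n_embd wide
new_qkv_rows = n_embd + 2 * n_embd_gqa    # 6400: full Q, single K/V head

print(old_qkv_rows, new_qkv_rows)         # the new layout is ~2.9x narrower
```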
@@ -3538,8 +3538,8 @@ static struct ggml_cgraph * llm_build_starcoder(
         cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
 
         struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-        struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-        struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+        struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
+        struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
 
         struct ggml_tensor * Qcur = tmpq;
         struct ggml_tensor * Kcur = tmpk;
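The last argument to `ggml_view_2d` is a byte offset into each fused row: Q starts at 0 and spans `n_embd` floats, K starts at `n_embd` and spans `n_embd_gqa`, and V starts at `n_embd + n_embd_gqa`. A sketch of the same slicing in numpy (hypothetical sizes; like `ggml_view_2d`, numpy's basic slicing returns views over the same storage rather than copies):

```python
import numpy as np

# Hypothetical sizes mirroring the ggml_view_2d offsets above.
n_embd, n_embd_gqa, N = 32, 8, 5   # N = number of tokens in the batch

# One row per token: [ Q (n_embd) | K (n_embd_gqa) | V (n_embd_gqa) ]
cur = np.random.randn(N, n_embd + 2 * n_embd_gqa).astype(np.float32)

tmpq = cur[:, :n_embd]                       # byte offset 0
tmpk = cur[:, n_embd:n_embd + n_embd_gqa]    # byte offset sizeof(float)*n_embd
tmpv = cur[:, n_embd + n_embd_gqa:]          # byte offset sizeof(float)*(n_embd + n_embd_gqa)

assert tmpq.shape == (N, n_embd)
assert tmpk.shape == (N, n_embd_gqa)
assert tmpv.shape == (N, n_embd_gqa)
```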
