From 9f01e7807ab30c31e15607cd0fe2fce07774523e Mon Sep 17 00:00:00 2001 From: "a.r.l" Date: Thu, 2 Jan 2025 19:22:53 +0800 Subject: [PATCH 1/4] feat: add opt model example --- common/common-ggml.cpp | 253 +++ common/common-ggml.h | 18 + examples/gpt-2-sparse/CMakeLists.txt | 15 + examples/gpt-2-sparse/README.md | 158 ++ .../gpt-2-sparse/convert-cerebras-to-ggml.py | 183 ++ examples/gpt-2-sparse/convert-ckpt-to-ggml.py | 159 ++ examples/gpt-2-sparse/convert-h5-to-ggml.py | 195 ++ examples/gpt-2-sparse/download-ggml-model.sh | 69 + examples/gpt-2-sparse/download-model.sh | 48 + examples/gpt-2-sparse/main-30b.cpp | 1593 +++++++++++++++++ examples/gpt-2-sparse/main.cpp_123 | 1592 ++++++++++++++++ examples/gpt-2-sparse/main.cpp_bak | 1546 ++++++++++++++++ examples/gpt-2-sparse/main13b.cpp | 1583 ++++++++++++++++ examples/gpt-2-sparse/main7b.cpp | 1567 ++++++++++++++++ examples/gpt-2-sparse/quantize.cpp | 184 ++ 15 files changed, 9163 insertions(+) create mode 100644 common/common-ggml.cpp create mode 100644 common/common-ggml.h create mode 100644 examples/gpt-2-sparse/CMakeLists.txt create mode 100644 examples/gpt-2-sparse/README.md create mode 100644 examples/gpt-2-sparse/convert-cerebras-to-ggml.py create mode 100644 examples/gpt-2-sparse/convert-ckpt-to-ggml.py create mode 100644 examples/gpt-2-sparse/convert-h5-to-ggml.py create mode 100755 examples/gpt-2-sparse/download-ggml-model.sh create mode 100755 examples/gpt-2-sparse/download-model.sh create mode 100644 examples/gpt-2-sparse/main-30b.cpp create mode 100644 examples/gpt-2-sparse/main.cpp_123 create mode 100644 examples/gpt-2-sparse/main.cpp_bak create mode 100644 examples/gpt-2-sparse/main13b.cpp create mode 100644 examples/gpt-2-sparse/main7b.cpp create mode 100644 examples/gpt-2-sparse/quantize.cpp diff --git a/common/common-ggml.cpp b/common/common-ggml.cpp new file mode 100644 index 00000000..794607c6 --- /dev/null +++ b/common/common-ggml.cpp @@ -0,0 +1,253 @@ +#include "common-ggml.h" + +#include +#include + +static const std::map GGML_FTYPE_MAP = { + {"q4_0", GGML_FTYPE_MOSTLY_Q4_0}, + {"q4_1", GGML_FTYPE_MOSTLY_Q4_1}, + {"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, + {"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, + {"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, +}; + +void ggml_print_ftypes(FILE * fp) { + for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) { + fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second); + } +} + +enum ggml_ftype ggml_parse_ftype(const char * str) { + enum ggml_ftype ftype; + if (str[0] == 'q') { + const auto it = GGML_FTYPE_MAP.find(str); + if (it == GGML_FTYPE_MAP.end()) { + fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str); + return GGML_FTYPE_UNKNOWN; + } + ftype = it->second; + } else { + ftype = (enum ggml_ftype) atoi(str); + } + + return ftype; +} + +bool ggml_common_quantize_0( + std::ifstream & finp, + std::ofstream & fout, + const ggml_ftype ftype, + const std::vector & to_quant, + const std::vector & to_skip) { + + ggml_type qtype = GGML_TYPE_F32; + + switch (ftype) { + case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_UNKNOWN: + case GGML_FTYPE_ALL_F32: + case GGML_FTYPE_MOSTLY_F16: + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: + case GGML_FTYPE_MOSTLY_Q2_K: + case GGML_FTYPE_MOSTLY_Q3_K: + case 
GGML_FTYPE_MOSTLY_Q4_K: + case GGML_FTYPE_MOSTLY_Q5_K: + case GGML_FTYPE_MOSTLY_Q6_K: + { + fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); + return false; + } + }; + + if (!ggml_is_quantized(qtype)) { + fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype)); + return false; + } + + size_t total_size_org = 0; + size_t total_size_new = 0; + + std::vector work; + + std::vector data_u8; + std::vector data_f16; + std::vector data_f32; + + std::vector hist_all(1 << 4, 0); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + finp.read(reinterpret_cast(&length), sizeof(length)); + finp.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (finp.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[4] = { 1, 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + finp.read (&name[0], length); + + printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); + + bool quantize = false; + + // check if we should quantize this tensor + for (const auto & s : to_quant) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // check if we should skip this tensor + for (const auto & s : to_skip) { + if (std::regex_match(name, std::regex(s))) { + quantize = false; + break; + } + } + + // quantize only 2D tensors + quantize &= (n_dims == 2); + + if (quantize) { + if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) { + fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); + return false; + } + + if (ttype == GGML_TYPE_F16) { + data_f16.resize(nelements); + finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); + data_f32.resize(nelements); + for (int i = 0; i < nelements; ++i) { + data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); + } + } else { + data_f32.resize(nelements); + finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); + } + + ttype = qtype; + } else { + // const int bpe = (ttype == 0) ? 
sizeof(float) : sizeof(uint16_t); + int bpe = -1; + if (ttype == 0 || ttype == 18) { + bpe = sizeof(float); + } + else { + bpe = sizeof(uint16_t); + } + + data_u8.resize(nelements*bpe); + finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); + } + + fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); + fout.write(reinterpret_cast(&length), sizeof(length)); + fout.write(reinterpret_cast(&ttype), sizeof(ttype)); + for (int i = 0; i < n_dims; ++i) { + fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); + } + fout.write(&name[0], length); + + if (quantize) { + work.resize(nelements); // for quantization + + size_t cur_size = 0; + std::vector hist_cur(1 << 4, 0); + + switch ((ggml_type) ttype) { + case GGML_TYPE_Q4_0: + { + cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: + { + cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_0: + { + cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_1: + { + cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q8_0: + { + cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_COUNT: + { + fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); + return false; + } + } + + fout.write(reinterpret_cast(work.data()), cur_size); + total_size_new += cur_size; + + printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); + for (int i = 0; i < (int) hist_cur.size(); ++i) { + hist_all[i] += hist_cur[i]; + } + + for (int i = 0; i < (int) hist_cur.size(); ++i) { + printf("%5.3f ", hist_cur[i] / (float)nelements); + } + printf("\n"); + } else { + printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); + fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); + total_size_new += data_u8.size(); + } + + total_size_org += nelements * sizeof(float); + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); + + { + int64_t sum_all = 0; + for (int i = 0; i < (int) hist_all.size(); ++i) { + sum_all += hist_all[i]; + } + + printf("%s: hist: ", __func__); + for (int i = 0; i < (int) hist_all.size(); ++i) { + printf("%5.3f ", hist_all[i] / (float)sum_all); + } + printf("\n"); + } + + return true; +} diff --git a/common/common-ggml.h b/common/common-ggml.h new file mode 100644 index 00000000..29ba4ad5 --- /dev/null +++ b/common/common-ggml.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ggml.h" + +#include +#include +#include + +enum ggml_ftype ggml_parse_ftype(const char * str); + +void ggml_print_ftypes(FILE * fp = stderr); + +bool ggml_common_quantize_0( + std::ifstream & finp, + std::ofstream & fout, + const ggml_ftype ftype, + const std::vector & to_quant, + const std::vector & to_skip); \ No newline at end of file diff --git 
a/examples/gpt-2-sparse/CMakeLists.txt b/examples/gpt-2-sparse/CMakeLists.txt new file mode 100644 index 00000000..a06b42dc --- /dev/null +++ b/examples/gpt-2-sparse/CMakeLists.txt @@ -0,0 +1,15 @@ +# +# gpt-2 + +set(TEST_TARGET gpt-2-sparse) +add_executable(${TEST_TARGET} main7b.cpp) +# target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) +target_link_libraries(${TEST_TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) + +# +# gpt-2-quantize + +set(TEST_TARGET gpt-2-quantize) +add_executable(${TEST_TARGET} quantize.cpp) +# target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common) diff --git a/examples/gpt-2-sparse/README.md b/examples/gpt-2-sparse/README.md new file mode 100644 index 00000000..509fabc5 --- /dev/null +++ b/examples/gpt-2-sparse/README.md @@ -0,0 +1,158 @@ +# gpt-2 + +This is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library. + +The program runs on the CPU - no video card is required. + +The [Cerebras-GPT](https://huggingface.co/cerebras) models are also supported. + +The example supports the following GPT-2 models: + +| Model | Description | Disk Size | +| --- | --- | --- | +| 117M | Small model | 240 MB | +| 345M | Medium model | 680 MB | +| 774M | Large model | 1.5 GB | +| 1558M | XL model | 3.0 GB | + +Sample performance on MacBook M1 Pro: + +| Model | Size | Time / Token | +| --- | --- | --- | +| GPT-2 | 117M | 5 ms | +| GPT-2 | 345M | 12 ms | +| GPT-2 | 774M | 23 ms | +| GPT-2 | 1558M | 42 ms | + +*TODO: add tables for Cerebras-GPT models* + +Sample output: + +``` +$ ./bin/gpt-2 -h +usage: ./bin/gpt-2 [options] + +options: + -h, --help show this help message and exit + -s SEED, --seed SEED RNG seed (default: -1) + -t N, --threads N number of threads to use during computation (default: 8) + -p PROMPT, --prompt PROMPT + prompt to start generation with (default: random) + -n N, --n_predict N number of tokens to predict (default: 200) + --top_k N top-k sampling (default: 40) + --top_p N top-p sampling (default: 0.9) + --temp N temperature (default: 1.0) + -b N, --batch_size N batch size for prompt processing (default: 8) + -m FNAME, --model FNAME + model path (default: models/gpt-2-117M/ggml-model.bin) + +$ ./bin/gpt-2 +gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin' +gpt2_model_load: n_vocab = 50257 +gpt2_model_load: n_ctx = 1024 +gpt2_model_load: n_embd = 768 +gpt2_model_load: n_head = 12 +gpt2_model_load: n_layer = 12 +gpt2_model_load: f16 = 1 +gpt2_model_load: ggml ctx size = 311.12 MB +gpt2_model_load: memory size = 72.00 MB, n_mem = 12288 +gpt2_model_load: model size = 239.08 MB +main: number of tokens in prompt = 1 + +So this is going to be the end of the line for us. + +If the Dolphins continue to do their business, it's possible that the team could make a bid to bring in new defensive coordinator Scott Linehan. + +Linehan's job is a little daunting, but he's a great coach and an excellent coach. I don't believe we're going to make the playoffs. + +We're going to have to work hard to keep our heads down and get ready to go.<|endoftext|> + +main: mem per token = 2048612 bytes +main: load time = 106.32 ms +main: sample time = 7.10 ms +main: predict time = 506.40 ms / 5.06 ms per token +main: total time = 629.84 ms +``` + +## Downloading and converting the original models (GPT-2) + +You can download the original model files using the [download-model.sh](download-model.sh) Bash script. 
The models are +in Tensorflow format, so in order to use them with ggml, you need to convert them to appropriate format. This is done +via the [convert-ckpt-to-ggml.py](convert-ckpt-to-ggml.py) python script. + +Here is the entire process for the GPT-2 117M model (download from official site + conversion): + +``` +cd ggml/build +../examples/gpt-2/download-model.sh 117M + +Downloading model 117M ... +models/gpt-2-117M/checkpoint 100%[=============================>] 77 --.-KB/s in 0s +models/gpt-2-117M/encoder.json 100%[=============================>] 1018K 1.20MB/s in 0.8s +models/gpt-2-117M/hparams.json 100%[=============================>] 90 --.-KB/s in 0s +models/gpt-2-117M/model.ckpt.data-00000-of-00001 100%[=============================>] 474.70M 1.21MB/s in 8m 39s +models/gpt-2-117M/model.ckpt.index 100%[=============================>] 5.09K --.-KB/s in 0s +models/gpt-2-117M/model.ckpt.meta 100%[=============================>] 460.11K 806KB/s in 0.6s +models/gpt-2-117M/vocab.bpe 100%[=============================>] 445.62K 799KB/s in 0.6s +Done! Model '117M' saved in 'models/gpt-2-117M/' + +Run the convert-ckpt-to-ggml.py script to convert the model to ggml format. + + python /Users/john/ggml/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-117M/ 1 + +``` + +This conversion requires that you have python and Tensorflow installed on your computer. Still, if you want to avoid +this, you can download the already converted ggml models as described below. + +## Downloading and converting the original models (Cerebras-GPT) + +Clone the respective repository from here: https://huggingface.co/cerebras + +Use the [convert-cerebras-to-ggml.py](convert-cerebras-to-ggml.py) script to convert the model to `ggml` format: + +``` +cd ggml/build +git clone https://huggingface.co/cerebras/Cerebras-GPT-111M models/ +python ../examples/gpt-2/convert-cerebras-to-ggml.py models/Cerebras-GPT-111M/ + +``` + +## Downloading the ggml model directly (GPT-2) + +For convenience, I will be hosting the converted ggml model files in order to make it easier to run the examples. This +way, you can directly download a single binary file and start using it. No python or Tensorflow is required. + +Here is how to get the 117M ggml model: + +``` +cd ggml/build +../examples/gpt-2/download-ggml-model.sh 117M + +Downloading ggml model 117M ... +models/gpt-2-117M/ggml-model.bin 100%[===============================>] 239.58M 8.52MB/s in 28s +Done! Model '117M' saved in 'models/gpt-2-117M/ggml-model.bin' +You can now use it like this: + + $ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example" + +``` + +At some point, I might decide to stop hosting these models. So in that case, simply revert to the manual process above. + +## Quantizing the models + +You can also try to quantize the `ggml` models via 4-bit integer quantization. +Keep in mind that for smaller models, this will render them completely useless. +You generally want to quantize larger models. 
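The last argument selects the target type: `ggml_parse_ftype()` in `common/common-ggml.cpp` (added by this patch) accepts either a name such as `q4_0`, `q4_1`, `q5_0`, `q5_1`, `q8_0`, or the corresponding numeric `ggml_ftype` value, and `ggml_common_quantize_0()` then re-encodes every matching 2D tensor while copying everything else through unchanged.

For reference, here is a minimal sketch of a quantizer built on those helpers. It is **not** the `quantize.cpp` shipped with this example; the tensor-name regex and the exact header handling are illustrative assumptions only.

```
#include "ggml.h"
#include "common-ggml.h"

#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

int main(int argc, char ** argv) {
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f16.bin model-quant.bin type\n", argv[0]);
        ggml_print_ftypes(stderr); // q4_0, q4_1, q5_0, q5_1, q8_0
        return 1;
    }

    const ggml_ftype ftype = ggml_parse_ftype(argv[3]); // accepts "q4_0" or "2"
    if (ftype == GGML_FTYPE_UNKNOWN) {
        return 1;
    }

    std::ifstream finp(argv[1], std::ios::binary);
    std::ofstream fout(argv[2], std::ios::binary);
    if (!finp || !fout) {
        fprintf(stderr, "failed to open input/output files\n");
        return 1;
    }

    // copy magic + hparams, patching the stored ftype
    // (layout written by the convert-*.py scripts in this example:
    //  magic, n_vocab, n_ctx, n_embd, n_head, n_layer, ftype)
    int32_t hdr[7];
    finp.read ((char *) hdr, sizeof(hdr));
    hdr[6] = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
    fout.write((char *) hdr, sizeof(hdr));

    // copy the vocabulary verbatim: a count followed by (length, bytes) pairs
    int32_t n_tokens = 0;
    finp.read ((char *) &n_tokens, sizeof(n_tokens));
    fout.write((char *) &n_tokens, sizeof(n_tokens));
    for (int32_t i = 0; i < n_tokens; ++i) {
        int32_t len = 0;
        finp.read ((char *) &len, sizeof(len));
        fout.write((char *) &len, sizeof(len));
        std::string word(len, 0);
        finp.read (&word[0], len);
        fout.write(word.data(), len);
    }

    // quantize the 2D weight matrices, keep everything else as-is
    // (placeholder pattern - the real tool uses its own tensor names)
    const std::vector<std::string> to_quant = { ".*\\.weight" };
    const std::vector<std::string> to_skip  = { };

    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, to_skip)) {
        fprintf(stderr, "%s: failed to quantize '%s'\n", argv[0], argv[1]);
        return 1;
    }
    return 0;
}
```

The commands below use the `gpt-2-quantize` tool that is actually built by this example: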
+ +``` +# quantize GPT-2 F16 to Q4_0 (faster but less precise) +./bin/gpt-2-quantize models/gpt-2-1558M/ggml-model-f16.bin models/gpt-2-1558M/ggml-model-q4_0.bin 2 +./bin/gpt-2 -m models/gpt-2-1558M/ggml-model-q4_0.bin -p "This is an example" + +# quantize Cerebras F16 to Q4_1 (slower but more precise) +./bin/gpt-2-quantize models/Cerebras-GPT-6.7B/ggml-model-f16.bin models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin 3 +./bin/gpt-2 -m models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin -p "This is an example" + +``` diff --git a/examples/gpt-2-sparse/convert-cerebras-to-ggml.py b/examples/gpt-2-sparse/convert-cerebras-to-ggml.py new file mode 100644 index 00000000..6057f81c --- /dev/null +++ b/examples/gpt-2-sparse/convert-cerebras-to-ggml.py @@ -0,0 +1,183 @@ +# Convert Cerebras models to ggml format +# +# ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/ +# + +import sys +import struct +import json +import torch +import numpy as np +import re + +from transformers import AutoModelForCausalLM + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +if len(sys.argv) < 2: + print("Usage: convert-cerebras-to-ggml.py dir-model [use-f32]\n") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model-f16.bin" + +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + encoder = json.load(f) + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# use 16-bit or 32-bit floats +use_f16 = True +if len(sys.argv) > 2: + use_f16 = False + fname_out = sys.argv[1] + "/ggml-model-f32.bin" + +model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) +#print (model) + +list_vars = model.state_dict() +#print (list_vars) + +print(hparams) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", hparams["n_positions"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +fout.write(struct.pack("i", use_f16)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(encoder))) + +for key in encoder: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + # rename headers to keep 
compatibility + if name == "transformer.ln_f.weight": + name = "model/ln_f/g" + elif name == "transformer.ln_f.bias": + name = "model/ln_f/b" + elif name == "transformer.wte.weight": + name = "model/wte" + elif name == "transformer.wpe.weight": + name = "model/wpe" + elif name == "lm_head.weight": + name = "model/lm_head" + elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/g" + elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/b" + elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/w" + elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/b" + elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/w" + elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/b" + elif re.match(r"transformer.h.\d+.ln_2.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/g" + elif re.match(r"transformer.h.\d+.ln_2.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/b" + elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/w" + elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/b" + elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/w" + elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/b" + else: + print("Unrecognized variable name. %s", name) + + # we don't need these + if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): + print(" Skipping variable: " + name) + continue + + n_dims = len(data.shape); + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0; + if use_f16: + if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype = 0 + + # for efficiency - transpose the projection matrices + # "model/h.*/attn/c_attn/w" + # "model/h.*/attn/c_proj/w" + # "model/h.*/mlp/c_fc/w" + # "model/h.*/mlp/c_proj/w" + if name[-14:] == "/attn/c_attn/w" or \ + name[-14:] == "/attn/c_proj/w" or \ + name[-11:] == "/mlp/c_fc/w" or \ + name[-13:] == "/mlp/c_proj/w": + print(" Transposing") + data = data.transpose() + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/examples/gpt-2-sparse/convert-ckpt-to-ggml.py b/examples/gpt-2-sparse/convert-ckpt-to-ggml.py new file mode 100644 index 00000000..9113141f --- /dev/null +++ b/examples/gpt-2-sparse/convert-ckpt-to-ggml.py @@ -0,0 +1,159 @@ +# Convert a model checkpoint to a ggml compatible file +# +# Load the model using TensorFlow. +# Iterate over all variables and write them to a binary file. 
+# +# For each variable, write the following: +# - Number of dimensions (int) +# - Name length (int) +# - Dimensions (int[n_dims]) +# - Name (char[name_length]) +# - Data (float[n_dims]) +# +# By default, the bigger matrices are converted to 16-bit floats. +# This can be disabled by adding the "use-f32" CLI argument. +# +# At the start of the ggml file we write the model parameters +# and vocabulary. +# + +import sys +import json +import struct +import numpy as np +import tensorflow as tf + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +# helper method to convert a numpy array to different float types +def convert_to_ftype(data, ftype): + # fp16 + if ftype == 1: + return data.astype(np.float16) + + assert False, "Invalid ftype: " + str(ftype) + +if len(sys.argv) < 3: + print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n") + print(" ftype == 0 -> float32") + print(" ftype == 1 -> float16") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model.bin" + +with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f: + encoder = json.load(f) + +with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if len(sys.argv) > 2: + ftype = int(sys.argv[2]) + if ftype < 0 or ftype > 1: + print("Invalid ftype: " + str(ftype)) + sys.exit(1) + fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + +list_vars = tf.train.list_variables(dir_model) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["n_vocab"])) +fout.write(struct.pack("i", hparams["n_ctx"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +fout.write(struct.pack("i", ftype)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(encoder))) + +for key in encoder: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name, shape in list_vars: + print("Processing variable: " + name + " with shape: ", shape) + + data = tf.train.load_variable(dir_model, name).squeeze() + n_dims = len(data.shape); + + # for efficiency - transpose the projection matrices + # "model/h.*/attn/c_attn/w" + # "model/h.*/attn/c_proj/w" + # "model/h.*/mlp/c_fc/w" + # "model/h.*/mlp/c_proj/w" + if name[-14:] == 
"/attn/c_attn/w" or \ + name[-14:] == "/attn/c_proj/w" or \ + name[-11:] == "/mlp/c_fc/w" or \ + name[-13:] == "/mlp/c_proj/w": + print(" Transposing") + data = data.transpose() + + dshape = data.shape + + ftype_cur = 0 + if ftype != 0: + # match name: + # "model/wte" + # "model/h.*/attn/c_attn/w" + # "model/h.*/attn/c_proj/w" + # "model/h.*/mlp/c_fc/w" + # "model/h.*/mlp/c_proj/w" + if name == "model/wte" or name[-2:] == "/w": + print(" Converting to " + ftype_str[ftype]) + data = convert_to_ftype(data, ftype) + ftype_cur = ftype + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", dshape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/examples/gpt-2-sparse/convert-h5-to-ggml.py b/examples/gpt-2-sparse/convert-h5-to-ggml.py new file mode 100644 index 00000000..6a2b8654 --- /dev/null +++ b/examples/gpt-2-sparse/convert-h5-to-ggml.py @@ -0,0 +1,195 @@ +# Convert GPT-2 h5 transformer model to ggml format +# +# Load the model using GPT2Model. +# Iterate over all variables and write them to a binary file. +# +# For each variable, write the following: +# - Number of dimensions (int) +# - Name length (int) +# - Dimensions (int[n_dims]) +# - Name (char[name_length]) +# - Data (float[n_dims]) +# +# By default, the bigger matrices are converted to 16-bit floats. +# This can be disabled by adding the "use-f32" CLI argument. +# +# At the start of the ggml file we write the model parameters +# and vocabulary. +# + +import sys +import struct +import json +import numpy as np +import re + +from transformers import GPT2Model + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +if len(sys.argv) < 2: + print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model.bin" + +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + encoder = json.load(f) + +with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: + encoder_added = json.load(f) + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# use 16-bit or 32-bit floats +use_f16 = True +if len(sys.argv) > 2: + use_f16 = False + fname_out = sys.argv[1] + "/ggml-model-f32.bin" + +model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True) +#print (model) + +list_vars = model.state_dict() +#print (list_vars) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", hparams["n_positions"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +#fout.write(struct.pack("i", hparams["rotary_dim"])) +fout.write(struct.pack("i", use_f16)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(encoder) + len(encoder_added))) + +for key in encoder: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for key in encoder_added: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + # we don't need these + if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): + print(" Skipping variable: " + name) + continue + + n_dims = len(data.shape); + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0; + if use_f16: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype = 0 + + # for efficiency - transpose these matrices: + # "transformer.h.*.mlp.c_proj.weight + if name.endswith(".mlp.c_proj.weight"): + print(" Transposing") + data = data.transpose() + + # rename headers to keep compatibility + if name == "ln_f.weight": + name = "model/ln_f/g" + elif name == "ln_f.bias": + name = "model/ln_f/b" + elif name == "wte.weight": + name = "model/wte" + elif name == "wpe.weight": + name = "model/wpe" + elif re.match(r"h\.\d+\.ln_1\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/g" + elif re.match(r"h\.\d+\.ln_1\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/b" + elif re.match(r"h\.\d+\.attn\.c_attn\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/w" + elif re.match(r"h\.\d+\.attn\.c_attn\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/b" + elif re.match(r"h\.\d+\.attn\.c_proj\.weight", name): + i = re.findall("\d+", name)[0] + 
name = f"model/h{i}/attn/c_proj/w" + elif re.match(r"h.\d+.attn.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/b" + elif re.match(r"h.\d+.ln_2.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/g" + elif re.match(r"h.\d+.ln_2.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/b" + elif re.match(r"h.\d+.mlp.c_fc.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/w" + elif re.match(r"h.\d+.mlp.c_fc.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/b" + elif re.match(r"h.\d+.mlp.c_proj.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/w" + elif re.match(r"h.\d+.mlp.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/b" + else: + print("Unrecognized variable name. %s", name) + + str = name.encode('utf-8') + + fout.write(struct.pack("iii", n_dims, len(str), ftype)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/examples/gpt-2-sparse/download-ggml-model.sh b/examples/gpt-2-sparse/download-ggml-model.sh new file mode 100755 index 00000000..3aae015b --- /dev/null +++ b/examples/gpt-2-sparse/download-ggml-model.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# This script downloads GPT-2 model files that have already been converted to ggml format. +# This way you don't have to convert them yourself. +# +# If you want to download the original GPT-2 model files, use the "download-model.sh" script instead. + +#src="https://ggml.ggerganov.com" +#pfx="ggml-model-gpt-2" + +src="https://huggingface.co/ggerganov/ggml" +pfx="resolve/main/ggml-model-gpt-2" + +ggml_path=$(dirname $(realpath $0)) + +# GPT-2 models +models=( "117M" "345M" "774M" "1558M" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [ "$#" -ne 1 ]; then + printf "Usage: $0 \n" + list_models + + exit 1 +fi + +model=$1 + +if [[ ! " ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +# download ggml model + +printf "Downloading ggml model $model ...\n" + +mkdir -p models/gpt-2-$model + +if [ -x "$(command -v wget)" ]; then + wget --quiet --show-progress -O models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin +elif [ -x "$(command -v curl)" ]; then + curl -L --output models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin +else + printf "Either wget or curl is required to download models.\n" + exit 1 +fi + +if [ $? -ne 0 ]; then + printf "Failed to download ggml model $model \n" + printf "Please try again later or download the original GPT-2 model files and convert them yourself.\n" + exit 1 +fi + +printf "Done! 
Model '$model' saved in 'models/gpt-2-$model/ggml-model.bin'\n" +printf "You can now use it like this:\n\n" +printf " $ ./bin/gpt-2 -m models/gpt-2-$model/ggml-model.bin -p \"This is an example\"\n" +printf "\n" diff --git a/examples/gpt-2-sparse/download-model.sh b/examples/gpt-2-sparse/download-model.sh new file mode 100755 index 00000000..f0c62f4f --- /dev/null +++ b/examples/gpt-2-sparse/download-model.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +ggml_path=$(dirname $(realpath $0)) + +# GPT-2 models +models=( "117M" "345M" "774M" "1558M" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [ "$#" -ne 1 ]; then + printf "Usage: $0 \n" + list_models + + exit 1 +fi + +model=$1 + +if [[ ! " ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +# download model + +printf "Downloading model $model ...\n" + +mkdir -p models/gpt-2-$model + +for file in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do + wget --quiet --show-progress -O models/gpt-2-$model/$file https://openaipublic.blob.core.windows.net/gpt-2/models/$model/$file +done + +printf "Done! Model '$model' saved in 'models/gpt-2-$model/'\n\n" +printf "Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.\n" +printf "\n" +printf " python $ggml_path/convert-ckpt-to-ggml.py models/gpt-2-$model/\n" +printf "\n" diff --git a/examples/gpt-2-sparse/main-30b.cpp b/examples/gpt-2-sparse/main-30b.cpp new file mode 100644 index 00000000..73eeff25 --- /dev/null +++ b/examples/gpt-2-sparse/main-30b.cpp @@ -0,0 +1,1593 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-cuda.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif +typedef void (*offload_func_t)(struct ggml_tensor * tensor); +void opt_nop(struct ggml_tensor * tensor) { // don't offload by default + (void) tensor; +} +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + // struct ggml_tensor * c_attn_attn_w; + // struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_attn_q_w; + struct ggml_tensor * c_attn_attn_q_b; + + struct ggml_tensor * c_attn_attn_k_w; + struct ggml_tensor * c_attn_attn_k_b; + + struct ggml_tensor * c_attn_attn_v_w; + struct ggml_tensor * c_attn_attn_v_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; + + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; + // gpu heat + struct ggml_tensor * c_mlp_fc_w_gpu; + struct ggml_tensor * c_mlp_proj_w_t; + struct ggml_tensor * c_mlp_proj_w_gpu; + + //predictor + struct ggml_tensor * mlp_pre_w1_w; + struct ggml_tensor * mlp_pre_w2_w; +}; + +struct opt_file { + // use FILE * so we don't have to re-open the file to 
mmap + FILE * fp; + size_t size; + + opt_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error("opt_file fail\n"); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + ~opt_file() { + if (fp) { + std::fclose(fp); + } + } +}; +#define _POSIX_MAPPED_FILES +#include +#include + +struct opt_mmap { + void * addr; + size_t size; + + opt_mmap(const opt_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error("mmap failed\n"); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~opt_mmap() { + munmap(addr, size); + } +#else + static constexpr bool SUPPORTED = false; + + opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +struct gpt2_model { + gpt2_hparams hparams; + struct opt_file * file; + struct opt_mmap * mapping; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct ggml_context * ctx0 = nullptr; +// std::vector compute_buffer; +void *compute_buffer; + +bool endsWith(const std::string& str, const std::string& suffix) { + if (str.length() < suffix.length()) { + return false; + } + return str.substr(str.length() - suffix.length()) == suffix; +} + + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + model.file = new opt_file(fname.c_str(), "rb"); + printf("size %d\n", model.file->size); + model.mapping = new opt_mmap(model.file, 0, false); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + 
} + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + /* int32_t n_vocab = 0; */ + /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ + + /* if (n_vocab != model.hparams.n_vocab) { */ + /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ + /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ + /* return false; */ + /* } */ + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + printf("wtype %d\n", wtype); + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += 
n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + //need refactor + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); + //predictor + ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size = 0; + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*51200; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + int main_gpu = 0; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#else +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; + // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; + + // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); + // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // model.lm_head->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["output_norm.weight"] = &model.ln_f_g; + model.tensors["output_norm.bias"] = &model.ln_f_b; + + model.tensors["tok_embeddings.weight"] = &model.wte; + model.tensors["pos_embeddings.weight"] = &model.wpe; + model.tensors["output.weight"] = &model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + memset(&layer, 0, sizeof(gpt2_layer)); + + // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // // 
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + // // need refine + // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); + // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); + // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); + + // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); + // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); + + // if (i <= 10) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); + // } else if (i <= 12) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); + // } else if (i <= 18) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); + + // } else if (i <= 21) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); + // } else if (i <= 26) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); + // } else if (i <= 31) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); + // } + + // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; + + // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; + // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_w_gpu->backend = 
OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; + model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; + + model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; + model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; + model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; + model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; + model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; + model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; + + model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; + model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; + model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; + } + } + + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + #ifdef GGML_USE_CUBLAS + // ggml_cuda_assign_buffers_no_scratch(model.memory_k); + // ggml_cuda_assign_buffers_no_scratch(model.memory_v); + #endif + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + ggml_set_no_alloc(ctx, true); + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + const std::vector to_gpu = { + "output_norm.bias", + "output_norm.weight", + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + // ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + "layers.*output_norm.weight", + "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_gpu_lv = { + // ".*attention.wq.weight", + // ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + // "layers.*output_norm.weight", + // "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_lock = { + "tok_embeddings.weight", + "pos_embeddings.weight", + // "output_norm.bias", + ".*attention.wq.weight", + ".*attention.wq.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + // ".*attention.wo.bias", + ".*feed_forward.w1.weight", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight", + ".*feed_forward.w2.bias", + ".*gpu.weight", + ".*attention_norm.weight", + ".*attention_norm.bias", + ".*output_norm.weight", + ".*output_norm.bias", + ".*attention.*fc1.weight", + ".*attention.*fc1.bias", + ".*attention.*fc2.weight", + ".*attention.*fc2.bias", + // ".*w2.bias", + // ".*w1.bias", + "output.weight", + }; + + while (true) { + int32_t n_dims; + 
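            // each tensor record in the model file stores: n_dims, the name length and
            // the ttype, followed by the dimensions, the name bytes and the raw tensor
            // data (the data itself is not copied here, it is mapped below via mmap)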
int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + int64_t new_ne[2]; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + new_ne[i] = ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + ggml_tensor ** ptr = model.tensors[name]; + // printf("name %s ptr %p\n", name.c_str(), *ptr); + // int k; + // scanf("%d", &k); + *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); + + auto tensor = (ggml_tensor *)*model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + + // for debugging + if (1) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + std::streampos offset = fin.tellg(); + // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.seekg(ggml_nbytes(tensor), std::ios::cur); + tensor->data = model.mapping->addr + static_cast(offset); + // if ( endsWith(name.c_str(), "weight_transpose")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i+4096]); + // } + // } + // printf("\n"); + // if (endsWith(name.c_str(), "weight_h20")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i]); + + // } + // int k; + // scanf("%d", &k); + // } + + // // GPT-2 models share the WTE tensor as the LM head + // if (name == "model/wte" && has_lm_head == false) { + // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + // } + + // if (name == "model/lm_head") { + // has_lm_head = true; + // } + if (model_params.low_vram == false) { + for (const auto &s : to_gpu) + { + // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) + // { + // std::regex pattern(R"(\d+)"); + // std::smatch match; + // int layer_id = 0; + // if (std::regex_search(name, match, pattern)) + // { + // std::string digitStr = match.str(); + // int num = std::stoi(digitStr); + // layer_id = num; + // } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + // if (layer_id > model_params.n_gpu_layers) + // break; + // } + if (std::regex_search(name, std::regex(s))) + { + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + } else { + for (const auto &s : to_gpu_lv) + { + if 
(std::regex_search(name, std::regex(s))) + { + std::regex pattern(R"(\d+)"); + std::smatch match; + int layer_id = 0; + if (std::regex_search(name, match, pattern)) + { + std::string digitStr = match.str(); + int num = std::stoi(digitStr); + layer_id = num; + } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + if (layer_id > model_params.n_gpu_layers) + break; + // printf("name %s\n", name.c_str()); + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + + } + if (tensor->backend == GGML_BACKEND_GPU) { + #if defined(GGML_USE_CUBLAS) + ggml_cuda_transform_tensor(tensor->data, tensor); + #endif + } + for (const auto &s : to_lock) + { + if (std::regex_match(name, std::regex(s))) + { + if(!mlock(tensor->data, ggml_nbytes(tensor))) { + // printf("mlock %s\n", name.c_str()); + } + else { + printf("mlock failed %s\n", name.c_str()); + } + } + } + + total_size += ggml_nbytes(tensor); + } + ggml_set_no_alloc(ctx, false); + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + printf("load finish\n"); + // int k; + // scanf("%d", &k); + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + // static std::vector buf(buf_size); + static void * buf = ggml_cuda_host_malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i + 2; + } + } + offload_func_t offload_func = opt_nop; + offload_func_t offload_func_kq = opt_nop; + offload_func_t offload_func_v = opt_nop; + offload_func_t offload_func_nr = opt_nop; + offload_func_t offload_debug = opt_nop; +#ifdef GGML_USE_CUBLAS + offload_debug = ggml_cuda_assign_buffers_no_alloc; + // offload_func = ggml_cuda_assign_buffers_no_alloc; + // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + // offload_func_v = ggml_cuda_assign_buffers_no_alloc; + // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; +#endif + // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; + // int k; + // scanf("%d", &k); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } 
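+    // Positions above are written as (n_past + i + 2): OPT's learned positional embeddings are
+    // offset by 2, which is also why the wpe table is sized with n_ctx + 2 rows elsewhere in this file.
+    // KQ_scale holds 1.0f/sqrtf(n_embd/n_head) = 1/sqrt(d_head), the standard scaled-dot-product
+    // attention factor applied to K*Q further below.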
+ + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL_first"); + // offload_func(inpL); + + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + offload_func(cur); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g); + offload_func(cur); + ggml_set_name(cur, "ln_1_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_1_b); + ggml_set_name(cur, "ln_1_b"); + // offload_func(cur); + + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + + struct ggml_tensor *k_cpy = nullptr; + struct ggml_tensor *v_cpy = nullptr; + // self-attention + { + // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); + offload_func_kq(Qcur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); + offload_func_kq(Kcur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); + offload_func_kq(Kcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); + offload_func_v(Vcur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); + offload_func_v(Vcur); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + offload_func_v(Vcur); + + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); + // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); + + offload_func_v(v); + k_cpy = ggml_cpy(ctx0, Kcur, k); + offload_func_kq(k_cpy); + ggml_set_name(k_cpy, "k_cpy"); + v_cpy = ggml_cpy(ctx0, Vcur, v); + offload_func_v(v_cpy); + ggml_set_name(v_cpy, "v_cpy"); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); + offload_func_kq(Qcur); + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + offload_func_kq(Q); + + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + // struct ggml_tensor * K = + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 
0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, model.memory_k, + 128, n_past + N, n_head, + ggml_element_size(model.memory_k)*n_embd, + ggml_element_size(model.memory_k)*128, + ggml_element_size(model.memory_k)*n_embd*n_ctx*il); + K->src[1] = k_cpy; + offload_func_kq(K); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + offload_func_kq(KQ_scaled); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, 128, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*128, + n_ctx*ggml_element_size(model.memory_k)*n_embd*il); + V->src[1] = v_cpy; + offload_func_v(V); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merge_cont"); + offload_func_v(cur); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_set_name(cur, "attn_proj"); + offload_func(cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_set_name(cur, "attn_bias"); + offload_func(cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + ggml_set_name(cur, "after attn"); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + offload_func(cur); + ggml_set_name(cur, "norm_FFN"); + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g); + offload_func(cur); + ggml_set_name(cur, "norm_FFN_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_2_b); + // offload_func(cur); + // ggml_set_name(cur, "norm_FFN_w"); + // cur_c = ggml_dup(ctx0, cur); + } + // if (N == 1) + if (1) + { + idx = ggml_mul_mat(ctx0, + 
model.layers[il].mlp_pre_w1_w, + inpFF); + offload_func(idx); + ggml_set_name(idx, "mlp_pre_w1"); + idx = ggml_relu(ctx0, idx); + offload_func(idx); + ggml_set_name(idx, "relu_pre"); + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w2_w, + idx); + ggml_set_name(idx, "mlp_pre_w2"); + // offload_func(idx); + // idx = ggml_sigmoid(ctx0, idx); + // offload_func(idx); + // idx_g = idx; + // idx = ggml_dup(ctx0, idx_g); + // ggml_set_name(idx, "idx_cpu_dup"); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + if (N >= 80) + // if (0) + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + offload_debug(cur); + offload_func(cur); + ggml_set_name(cur, "up_ffn"); + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + offload_debug(cur); + offload_func(cur); + } + else + { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_fc_w, + // cur); + // offload_func(cur); + // cur = ggml_add(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b); + // offload_func(cur); + + + struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, + model.layers[il].c_mlp_fc_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "mlp_up_gpu"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_mul_mat_idx(ctx0, + model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_idx); + ggml_set_name(cur, "mlp_up_cpu"); + cur = ggml_add_idx(ctx0, + cur, + model.layers[il].c_mlp_fc_b, + idx); + ggml_set_name(tmp, "mlp_up_bias"); + offload_debug(tmp); + offload_func(tmp); + + cur = ggml_add(ctx0, cur, tmp); + ggml_set_name(cur, "mlp_up_mix"); + offload_func(cur); + + // cur = tmp; + + } + + + + // GELU activation + // [3072, N] + cur = ggml_relu(ctx0, cur); + // cur_c = cur; + // offload_func(cur); + cur_c = cur->backend==GGML_BACKEND_CPU? 
cur : ggml_dup(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + if (N >= 80) { + // if (0) { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur, + NULL, + NULL); + offload_debug(cur); + offload_func(cur); + ggml_set_name(cur, "down_ffn"); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + offload_func(cur); + offload_debug(cur); + } + else { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + // offload_func(cur); + + // cur = ggml_axpy(ctx0, + // model.layers[il].c_mlp_proj_w_t, + // cur, + // NULL, + // NULL); + // offload_func(cur); + + + // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, + // model.layers[il].c_mlp_proj_w_gpu, + // cur, + // model.layers[il].gpu_bucket, + // NULL); + struct ggml_tensor *tmp = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "axpy"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur_c, + idx, + model.layers[il].gpu_idx); + + cur = ggml_add(ctx0, cur, tmp); + offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); + offload_func(cur); + + // tmp = ggml_add(ctx0, + // tmp, + // model.layers[il].c_mlp_proj_b); + // offload_func(tmp); + // offload_debug(tmp); + + // cur = tmp; + } + + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + offload_func(inpL); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + offload_func_nr(inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_mul(ctx0, + inpL, + model.ln_f_g); + offload_func_nr(inpL); + inpL = ggml_add(ctx0, + inpL, + model.ln_f_b); + ggml_set_name(inpL, "before"); + offload_func_nr(inpL); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "last_layer"); +// offload_func(inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); + ggml_cuda_assign_scratch_offset(node, 
(char*)node->data - (char *) compute_buffer); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } +#endif + + + + // run the computation + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + static std::vector work_buffer; + work_buffer.resize(plan.work_size); + plan.work_data = work_buffer.data(); + ggml_graph_compute(gf, &plan); + + //if (n_past%100 == 0) { + // ggml_graph_print (gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, "hello world"); + } + printf("load finish\n"); + + // keep this buffer alive while evaluating the model + + struct ggml_allocr * allocr = NULL; + // allocate the compute buffer + { + allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + // compute_buffer.resize(mem_size); + compute_buffer = ggml_cuda_host_malloc(mem_size); + // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // 
submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + int cnt = 0; + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + cnt += 1; + + if (cnt > 0) + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + llama_sampling_params & sparams = params.sparams; + const int top_k = sparams.top_k; + const float top_p = sparams.top_p; + const float temp = sparams.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/gpt-2-sparse/main.cpp_123 b/examples/gpt-2-sparse/main.cpp_123 new file mode 100644 index 00000000..4deed1df --- /dev/null +++ b/examples/gpt-2-sparse/main.cpp_123 @@ -0,0 +1,1592 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-cuda.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif +typedef void (*offload_func_t)(struct ggml_tensor * tensor); +void opt_nop(struct ggml_tensor * tensor) { // don't offload by default + (void) tensor; +} +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + // struct ggml_tensor * c_attn_attn_w; + // struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_attn_q_w; + struct ggml_tensor * c_attn_attn_q_b; + + struct ggml_tensor * c_attn_attn_k_w; + struct ggml_tensor * c_attn_attn_k_b; + + struct ggml_tensor * c_attn_attn_v_w; + struct ggml_tensor * c_attn_attn_v_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * 
c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; + + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; + // gpu heat + struct ggml_tensor * c_mlp_fc_w_gpu; + struct ggml_tensor * c_mlp_proj_w_t; + struct ggml_tensor * c_mlp_proj_w_gpu; + + //predictor + struct ggml_tensor * mlp_pre_w1_w; + struct ggml_tensor * mlp_pre_w2_w; +}; + +struct opt_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + opt_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error("opt_file fail\n"); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + ~opt_file() { + if (fp) { + std::fclose(fp); + } + } +}; +#define _POSIX_MAPPED_FILES +#include +#include + +struct opt_mmap { + void * addr; + size_t size; + + opt_mmap(const opt_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error("mmap failed\n"); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~opt_mmap() { + munmap(addr, size); + } +#else + static constexpr bool SUPPORTED = false; + + opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +struct gpt2_model { + gpt2_hparams hparams; + struct opt_file * file; + struct opt_mmap * mapping; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct ggml_context * ctx0 = nullptr; +// std::vector compute_buffer; +void *compute_buffer; + +bool endsWith(const std::string& str, const std::string& suffix) { + if (str.length() < suffix.length()) { + return false; + } + return str.substr(str.length() - suffix.length()) 
== suffix; +} + + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + model.file = new opt_file(fname.c_str(), "rb"); + printf("size %d\n", model.file->size); + model.mapping = new opt_mmap(model.file, 0, false); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + /* int32_t n_vocab = 0; */ + /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ + + /* if (n_vocab != model.hparams.n_vocab) { */ + /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ + /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ + /* return false; */ + /* } */ + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + printf("wtype %d\n", wtype); + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += 
n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + //need refactor + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); + //predictor + ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size = 0; + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*51200; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + int main_gpu = 0; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#else +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; + // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; + + // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); + // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // model.lm_head->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["output_norm.weight"] = &model.ln_f_g; + model.tensors["output_norm.bias"] = &model.ln_f_b; + + 
model.tensors["tok_embeddings.weight"] = &model.wte; + model.tensors["pos_embeddings.weight"] = &model.wpe; + model.tensors["output.weight"] = &model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + memset(&layer, 0, sizeof(gpt2_layer)); + + // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + // // need refine + // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); + // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); + // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); + + // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); + // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); + + // if (i <= 10) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); + // } else if (i <= 12) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); + // } else if (i <= 18) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); + + // } else if (i <= 21) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); + // } else if (i <= 26) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); + // } else if (i <= 31) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); + // } + + // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_w->backend = 
OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; + + // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; + // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; + model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; + + model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; + model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; + model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; + model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; + model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; + model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; + + model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; + model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; + model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; + } + } + + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + #ifdef GGML_USE_CUBLAS + // ggml_cuda_assign_buffers_no_scratch(model.memory_k); + // ggml_cuda_assign_buffers_no_scratch(model.memory_v); + #endif + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + ggml_set_no_alloc(ctx, true); + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + const std::vector to_gpu = { + "output_norm.bias", + "output_norm.weight", + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + // ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + "layers.*output_norm.weight", + "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_gpu_lv = { + // ".*attention.wq.weight", + // ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + // "layers.*output_norm.weight", + // "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_lock = { + "tok_embeddings.weight", + "pos_embeddings.weight", + // "output_norm.bias", + ".*attention.wq.weight", + ".*attention.wq.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + // ".*attention.wo.bias", + ".*feed_forward.w1.weight", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight", + ".*feed_forward.w2.bias", + ".*gpu.weight", + ".*attention_norm.weight", + ".*attention_norm.bias", + ".*output_norm.weight", + ".*output_norm.bias", + ".*attention.*fc1.weight", + ".*attention.*fc1.bias", + ".*attention.*fc2.weight", + ".*attention.*fc2.bias", + // ".*w2.bias", + // ".*w1.bias", + "output.weight", + }; + + while (true) { + int32_t n_dims; + 
int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + int64_t new_ne[2]; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + new_ne[i] = ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + ggml_tensor ** ptr = model.tensors[name]; + // printf("name %s ptr %p\n", name.c_str(), *ptr); + // int k; + // scanf("%d", &k); + *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); + + auto tensor = (ggml_tensor *)*model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + + // for debugging + if (1) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + std::streampos offset = fin.tellg(); + // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.seekg(ggml_nbytes(tensor), std::ios::cur); + tensor->data = model.mapping->addr + static_cast(offset); + // if ( endsWith(name.c_str(), "weight_transpose")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i+4096]); + // } + // } + // printf("\n"); + // if (endsWith(name.c_str(), "weight_h20")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i]); + + // } + // int k; + // scanf("%d", &k); + // } + + // // GPT-2 models share the WTE tensor as the LM head + // if (name == "model/wte" && has_lm_head == false) { + // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + // } + + // if (name == "model/lm_head") { + // has_lm_head = true; + // } + if (model_params.low_vram == false) { + for (const auto &s : to_gpu) + { + // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) + // { + // std::regex pattern(R"(\d+)"); + // std::smatch match; + // int layer_id = 0; + // if (std::regex_search(name, match, pattern)) + // { + // std::string digitStr = match.str(); + // int num = std::stoi(digitStr); + // layer_id = num; + // } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + // if (layer_id > model_params.n_gpu_layers) + // break; + // } + if (std::regex_search(name, std::regex(s))) + { + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + } else { + for (const auto &s : to_gpu_lv) + { + if 
(std::regex_search(name, std::regex(s))) + { + std::regex pattern(R"(\d+)"); + std::smatch match; + int layer_id = 0; + if (std::regex_search(name, match, pattern)) + { + std::string digitStr = match.str(); + int num = std::stoi(digitStr); + layer_id = num; + } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + if (layer_id > model_params.n_gpu_layers) + break; + // printf("name %s\n", name.c_str()); + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + + } + if (tensor->backend == GGML_BACKEND_GPU) { + #if defined(GGML_USE_CUBLAS) + ggml_cuda_transform_tensor(tensor->data, tensor); + #endif + } + for (const auto &s : to_lock) + { + if (std::regex_match(name, std::regex(s))) + { + if(!mlock(tensor->data, ggml_nbytes(tensor))) { + // printf("mlock %s\n", name.c_str()); + } + else { + printf("mlock failed %s\n", name.c_str()); + } + } + } + + total_size += ggml_nbytes(tensor); + } + ggml_set_no_alloc(ctx, false); + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + printf("load finish\n"); + // int k; + // scanf("%d", &k); + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + // static std::vector buf(buf_size); + static void * buf = ggml_cuda_host_malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i + 2; + } + } + offload_func_t offload_func = opt_nop; + offload_func_t offload_func_kq = opt_nop; + offload_func_t offload_func_v = opt_nop; + offload_func_t offload_func_nr = opt_nop; + offload_func_t offload_debug = opt_nop; +#ifdef GGML_USE_CUBLAS + offload_debug = ggml_cuda_assign_buffers_no_alloc; + // offload_func = ggml_cuda_assign_buffers_no_alloc; + // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + // offload_func_v = ggml_cuda_assign_buffers_no_alloc; + // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; +#endif + // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; + // int k; + // scanf("%d", &k); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } 
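+    // embd, position and KQ_scale are allocated through the ggml_allocr; during the measure pass
+    // (ggml_allocr_is_measure) their data pointers are not valid yet, so the writes above are
+    // skipped and only the required compute-buffer size is recorded for the later real allocation.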
+ + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL_first"); + // offload_func(inpL); + + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + offload_func(cur); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g); + offload_func(cur); + ggml_set_name(cur, "ln_1_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_1_b); + ggml_set_name(cur, "ln_1_b"); + // offload_func(cur); + + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + + struct ggml_tensor *k_cpy = nullptr; + struct ggml_tensor *v_cpy = nullptr; + // self-attention + { + // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); + offload_func_kq(Qcur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); + offload_func_kq(Kcur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); + offload_func_kq(Kcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); + offload_func_v(Vcur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); + offload_func_v(Vcur); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + offload_func_v(Vcur); + + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); + // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); + + offload_func_v(v); + k_cpy = ggml_cpy(ctx0, Kcur, k); + offload_func_kq(k_cpy); + ggml_set_name(k_cpy, "k_cpy"); + v_cpy = ggml_cpy(ctx0, Vcur, v); + offload_func_v(v_cpy); + ggml_set_name(v_cpy, "v_cpy"); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); + offload_func_kq(Qcur); + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + offload_func_kq(Q); + + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + // struct ggml_tensor * K = + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 
0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, model.memory_k, + 128, n_past + N, n_head, + ggml_element_size(model.memory_k)*n_embd, + ggml_element_size(model.memory_k)*128, + ggml_element_size(model.memory_k)*n_embd*n_ctx*il); + K->src[1] = k_cpy; + offload_func_kq(K); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + offload_func_kq(KQ_scaled); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, 128, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*128, + n_ctx*ggml_element_size(model.memory_k)*n_embd*il); + V->src[1] = v_cpy; + offload_func_v(V); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merge_cont"); + offload_func_v(cur); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_set_name(cur, "attn_proj"); + offload_func(cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_set_name(cur, "attn_bias"); + offload_func(cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + ggml_set_name(cur, "after attn"); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + offload_func(cur); + ggml_set_name(cur, "norm_FFN"); + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g); + offload_func(cur); + ggml_set_name(cur, "norm_FFN_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_2_b); + // offload_func(cur); + // ggml_set_name(cur, "norm_FFN_w"); + // cur_c = ggml_dup(ctx0, cur); + } + // if (N == 1) + if (1) + { + idx = ggml_mul_mat(ctx0, + 
model.layers[il].mlp_pre_w1_w, + inpFF); + offload_func(idx); + ggml_set_name(idx, "mlp_pre_w1"); + idx = ggml_relu(ctx0, idx); + offload_func(idx); + ggml_set_name(idx, "relu_pre"); + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w2_w, + idx); + ggml_set_name(idx, "mlp_pre_w2"); + // offload_func(idx); + // idx = ggml_sigmoid(ctx0, idx); + // offload_func(idx); + // idx_g = idx; + // idx = ggml_dup(ctx0, idx_g); + // ggml_set_name(idx, "idx_cpu_dup"); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + if (N >= 80) + // if (0) + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + offload_debug(cur); + offload_func(cur); + ggml_set_name(cur, "up_ffn"); + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + offload_debug(cur); + offload_func(cur); + } + else + { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_fc_w, + // cur); + // offload_func(cur); + // cur = ggml_add(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b); + // offload_func(cur); + + + struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, + model.layers[il].c_mlp_fc_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "mlp_up_gpu"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_mul_mat_idx(ctx0, + model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_idx); + ggml_set_name(cur, "mlp_up_cpu"); + cur = ggml_add_idx(ctx0, + cur, + model.layers[il].c_mlp_fc_b, + idx); + ggml_set_name(tmp, "mlp_up_bias"); + offload_debug(tmp); + offload_func(tmp); + + cur = ggml_add(ctx0, cur, tmp); + ggml_set_name(cur, "mlp_up_mix"); + offload_func(cur); + + // cur = tmp; + + } + + + + // GELU activation + // [3072, N] + cur = ggml_relu(ctx0, cur); + // cur_c = cur; + // offload_func(cur); + cur_c = cur->backend==GGML_BACKEND_CPU? 
cur : ggml_dup(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + if (N >= 80) { + // if (0) { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur, + NULL, + NULL); + offload_debug(cur); + offload_func(cur); + ggml_set_name(cur, "down_ffn"); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + offload_func(cur); + offload_debug(cur); + } + else { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + // offload_func(cur); + + // cur = ggml_axpy(ctx0, + // model.layers[il].c_mlp_proj_w_t, + // cur, + // NULL, + // NULL); + // offload_func(cur); + + + // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, + // model.layers[il].c_mlp_proj_w_gpu, + // cur, + // model.layers[il].gpu_bucket, + // NULL); + struct ggml_tensor *tmp = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "axpy"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur_c, + idx, + model.layers[il].gpu_idx); + + cur = ggml_add(ctx0, cur, tmp); + offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); + offload_func(cur); + + // tmp = ggml_add(ctx0, + // tmp, + // model.layers[il].c_mlp_proj_b); + // offload_func(tmp); + // offload_debug(tmp); + + // cur = tmp; + } + + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + offload_func(inpL); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + offload_func_nr(inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_mul(ctx0, + inpL, + model.ln_f_g); + offload_func_nr(inpL); + inpL = ggml_add(ctx0, + inpL, + model.ln_f_b); + ggml_set_name(inpL, "before"); + offload_func_nr(inpL); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "last_layer"); +// offload_func(inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); + ggml_cuda_assign_scratch_offset(node, 
(char*)node->data - (char *) compute_buffer); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } +#endif + + + + // run the computation + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + static std::vector work_buffer; + work_buffer.resize(plan.work_size); + plan.work_data = work_buffer.data(); + ggml_graph_compute(gf, &plan); + + //if (n_past%100 == 0) { + // ggml_graph_print (gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, "hello world"); + } + printf("load finish\n"); + + // keep this buffer alive while evaluating the model + + struct ggml_allocr * allocr = NULL; + // allocate the compute buffer + { + allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + // compute_buffer.resize(mem_size); + compute_buffer = ggml_cuda_host_malloc(mem_size); + // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // submit the input 
prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + int cnt = 0; + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + cnt += 1; + + if (cnt > 0) + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/gpt-2-sparse/main.cpp_bak b/examples/gpt-2-sparse/main.cpp_bak new file mode 100644 index 00000000..e1e9d58e --- /dev/null +++ b/examples/gpt-2-sparse/main.cpp_bak @@ -0,0 +1,1546 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-cuda.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif +typedef void (*offload_func_t)(struct ggml_tensor * tensor); +void opt_nop(struct ggml_tensor * tensor) { // don't offload by default + (void) tensor; +} +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + // struct ggml_tensor * c_attn_attn_w; + // struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_attn_q_w; + struct ggml_tensor * c_attn_attn_q_b; + + struct ggml_tensor * c_attn_attn_k_w; + struct ggml_tensor * c_attn_attn_k_b; + + struct ggml_tensor * c_attn_attn_v_w; + struct ggml_tensor * c_attn_attn_v_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct 
ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; + + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; + // gpu heat + struct ggml_tensor * c_mlp_fc_w_gpu; + struct ggml_tensor * c_mlp_proj_w_t; + struct ggml_tensor * c_mlp_proj_w_gpu; + + //predictor + struct ggml_tensor * mlp_pre_w1_w; + struct ggml_tensor * mlp_pre_w2_w; +}; + +struct opt_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + opt_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error("opt_file fail\n"); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + ~opt_file() { + if (fp) { + std::fclose(fp); + } + } +}; +#define _POSIX_MAPPED_FILES +#include +#include + +struct opt_mmap { + void * addr; + size_t size; + + opt_mmap(const opt_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error("mmap failed\n"); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~opt_mmap() { + munmap(addr, size); + } +#else + static constexpr bool SUPPORTED = false; + + opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +struct gpt2_model { + gpt2_hparams hparams; + struct opt_file * file; + struct opt_mmap * mapping; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct ggml_context * ctx0 = nullptr; +// std::vector compute_buffer; +void *compute_buffer; + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + model.file = new 
opt_file(fname.c_str(), "rb"); + printf("size %d\n", model.file->size); + model.mapping = new opt_mmap(model.file, 0, false); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + /* int32_t n_vocab = 0; */ + /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ + + /* if (n_vocab != model.hparams.n_vocab) { */ + /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ + /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ + /* return false; */ + /* } */ + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + printf("wtype %d\n", wtype); + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size 
+= n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + //need refactor + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); + //predictor + ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size = 0; + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*51200; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + int main_gpu = 0; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#else +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; + // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; + + // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); + // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // model.lm_head->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["output_norm.weight"] = &model.ln_f_g; + model.tensors["output_norm.bias"] = &model.ln_f_b; + + model.tensors["tok_embeddings.weight"] = &model.wte; + model.tensors["pos_embeddings.weight"] = &model.wpe; + model.tensors["output.weight"] = &model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + memset(&layer, 0, sizeof(gpt2_layer)); + + // layer.ln_1_g = ggml_new_tensor_1d(ctx, 
GGML_TYPE_F32, n_embd); + // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + // // need refine + // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); + // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); + // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); + + // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); + // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); + + // if (i <= 10) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); + // } else if (i <= 12) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); + // } else if (i <= 18) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); + + // } else if (i <= 21) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); + // } else if (i <= 26) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); + // } else if (i <= 31) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); + // } + + // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; + // // 
layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; + + // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; + // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; + model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; + + model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; + model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; + model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; + model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; + model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; + model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; + + model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; + model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; + model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; + } + } + + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + #ifdef GGML_USE_CUBLAS + // ggml_cuda_assign_buffers_no_scratch(model.memory_k); + // ggml_cuda_assign_buffers_no_scratch(model.memory_v); + #endif + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + ggml_set_no_alloc(ctx, true); + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + const std::vector to_gpu = { + "output_norm.bias", + "output_norm.weight", + // ".*attention.wq.weight", + // ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + // ".*feed_forward.w1.weight$", + ".*feed_forward.w1.bias", + // ".*feed_forward.w2.weight_h20$", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight$", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + "layers.*output_norm.weight", + "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_gpu_lv = { + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + // "layers.*output_norm.weight", + // "layers.*output_norm.bias", + // ".*fc1.weight", + // ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_lock = { + "tok_embeddings.weight", + "pos_embeddings.weight", + // "output_norm.bias", + ".*attention.wq.weight", + ".*attention.wq.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + // ".*attention.wo.bias", + ".*feed_forward.w1.weight", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight", + ".*feed_forward.w2.bias", + ".*gpu.weight", + ".*attention_norm.weight", + ".*attention_norm.bias", + ".*output_norm.weight", + ".*output_norm.bias", + ".*attention.*fc1.weight", + ".*attention.*fc1.bias", + ".*attention.*fc2.weight", + ".*attention.*fc2.bias", + // ".*w2.bias", + // 
".*w1.bias", + "output.weight", + }; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + int64_t new_ne[2]; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + new_ne[i] = ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + ggml_tensor ** ptr = model.tensors[name]; + // printf("name %s ptr %p\n", name.c_str(), *ptr); + // int k; + // scanf("%d", &k); + *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); + + auto tensor = (ggml_tensor *)*model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + std::streampos offset = fin.tellg(); + // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.seekg(ggml_nbytes(tensor), std::ios::cur); + tensor->data = model.mapping->addr + static_cast(offset); + + // // GPT-2 models share the WTE tensor as the LM head + // if (name == "model/wte" && has_lm_head == false) { + // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + // } + + // if (name == "model/lm_head") { + // has_lm_head = true; + // } + if (model_params.low_vram == false) { + for (const auto &s : to_gpu) + { + if (std::regex_search(name, std::regex(s))) + { + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + } else { + for (const auto &s : to_gpu_lv) + { + if (std::regex_search(name, std::regex(s))) + { + std::regex pattern(R"(\d+)"); + std::smatch match; + int layer_id = 0; + if (std::regex_search(name, match, pattern)) + { + std::string digitStr = match.str(); + int num = std::stoi(digitStr); + layer_id = num; + } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + if (layer_id > model_params.n_gpu_layers) + break; + // printf("name %s\n", name.c_str()); + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + + } + if (tensor->backend == GGML_BACKEND_GPU) { + #if defined(GGML_USE_CUBLAS) + ggml_cuda_transform_tensor(tensor->data, tensor); + #endif + } + for (const auto &s : to_lock) + { + if (std::regex_match(name, std::regex(s))) + { + if(!mlock(tensor->data, ggml_nbytes(tensor))) { + // printf("mlock %s\n", name.c_str()); + } + else { + printf("mlock 
failed %s\n", name.c_str()); + } + } + } + + total_size += ggml_nbytes(tensor); + } + ggml_set_no_alloc(ctx, false); + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + // static std::vector buf(buf_size); + static void * buf = ggml_cuda_host_malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i + 2; + } + } + offload_func_t offload_func = opt_nop; + offload_func_t offload_func_kq = opt_nop; + offload_func_t offload_func_v = opt_nop; + offload_func_t offload_func_nr = opt_nop; + offload_func_t offload_debug = opt_nop; +#ifdef GGML_USE_CUBLAS + // offload_debug = ggml_cuda_assign_buffers_no_alloc; + // offload_func = ggml_cuda_assign_buffers_no_alloc; + // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + // offload_func_v = ggml_cuda_assign_buffers_no_alloc; + // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; +#endif + // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; + // int k; + // scanf("%d", &k); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL_first"); + // offload_func(inpL); + + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + offload_func(cur); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g); + offload_func(cur); + ggml_set_name(cur, "ln_1_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_1_b); + ggml_set_name(cur, "ln_1_b"); + // offload_func(cur); + + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + + struct ggml_tensor *k_cpy 
= nullptr; + struct ggml_tensor *v_cpy = nullptr; + // self-attention + { + // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); + offload_func_kq(Qcur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); + offload_func_kq(Kcur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); + offload_func_kq(Kcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); + offload_func_v(Vcur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); + offload_func_v(Vcur); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + offload_func_v(Vcur); + + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); + // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); + + offload_func_v(v); + k_cpy = ggml_cpy(ctx0, Kcur, k); + offload_func_kq(k_cpy); + ggml_set_name(k_cpy, "k_cpy"); + v_cpy = ggml_cpy(ctx0, Vcur, v); + offload_func_v(v_cpy); + ggml_set_name(v_cpy, "v_cpy"); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); + offload_func_kq(Qcur); + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + offload_func_kq(Q); + + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + // struct ggml_tensor * K = + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, model.memory_k, + 128, n_past + N, n_head, + ggml_element_size(model.memory_k)*n_embd, + ggml_element_size(model.memory_k)*128, + ggml_element_size(model.memory_k)*n_embd*n_ctx*il); + K->src[1] = k_cpy; + offload_func_kq(K); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // 
[n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + offload_func_kq(KQ_scaled); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, 128, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*128, + n_ctx*ggml_element_size(model.memory_k)*n_embd*il); + V->src[1] = v_cpy; + offload_func_v(V); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merge_cont"); + offload_func_v(cur); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_set_name(cur, "attn_proj"); + offload_func(cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_set_name(cur, "attn_bias"); + offload_func(cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + ggml_set_name(cur, "after attn"); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + offload_func(cur); + ggml_set_name(cur, "norm_FFN"); + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g); + offload_func(cur); + ggml_set_name(cur, "norm_FFN_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_2_b); + // offload_func(cur); + // ggml_set_name(cur, "norm_FFN_w"); + // cur_c = ggml_dup(ctx0, cur); + } + // if (N == 1) + if (1) + { + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w1_w, + inpFF); + offload_func(idx); + ggml_set_name(idx, "mlp_pre_w1"); + idx = ggml_relu(ctx0, idx); + offload_func(idx); + ggml_set_name(idx, "relu_pre"); + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w2_w, + idx); + ggml_set_name(idx, "mlp_pre_w2"); + // offload_func(idx); + // idx = ggml_sigmoid(ctx0, idx); + // offload_func(idx); + // idx_g = idx; + // idx = ggml_dup(ctx0, idx_g); + // ggml_set_name(idx, "idx_cpu_dup"); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + if (N != 1) + // if (0) + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + offload_func(cur); + ggml_set_name(cur, "up_ffn"); + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + offload_func(cur); + } 
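+                // [editor sketch, not part of the original patch] The else-branch below is the
+                // sparse decode path: the predictor output `idx` (from mlp_pre_w1_w/mlp_pre_w2_w
+                // above) estimates which FFN neurons fire, and the up-projection is then split
+                // between GPU-resident rows (c_mlp_fc_w_gpu, addressed via gpu_bucket) and the
+                // remaining CPU rows (c_mlp_fc_w, masked via gpu_idx). Conceptually, with the
+                // exact merge/bias handling differing slightly between the main*.cpp variants
+                // in this patch:
+                //
+                //     tmp = mul_mat_special(fc_w_gpu, cur, idx, gpu_bucket);   // GPU slice
+                //     cur = mul_mat_idx    (fc_w,     cur, idx, gpu_idx);      // CPU remainder
+                //     cur = cur + tmp, plus fc_b gathered by idx               // merge + bias
+                //
+                // ggml_mul_mat_special / ggml_mul_mat_idx / ggml_add_idx are not stock ggml
+                // operators; they are assumed to come from the sparse kernels added elsewhere
+                // in this patch series.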
+ else + { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_fc_w, + // cur); + // offload_func(cur); + // cur = ggml_add(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b); + // offload_func(cur); + + + struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, + model.layers[il].c_mlp_fc_w_gpu, + // model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "mlp_up_gpu"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_mul_mat_idx(ctx0, + model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_idx); + ggml_set_name(cur, "mlp_up_cpu"); + + // cur = ggml_add_idx(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b, + // idx); + // offload_func(cur); + tmp = ggml_add_idx(ctx0, + tmp, + model.layers[il].c_mlp_fc_b, + idx); + offload_debug(tmp); + + + // cur = ggml_add(ctx0, cur, tmp); + // ggml_set_name(cur, "mlp_up_mix"); + // offload_func(cur); + + // cur = tmp; + + } + + + + // GELU activation + // [3072, N] + cur = ggml_relu(ctx0, cur); + // cur_c = cur; + // offload_func(cur); + cur_c = cur->backend==GGML_BACKEND_CPU? cur : ggml_dup(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + // if (N != 1) { + if (0) { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + offload_func(cur); + ggml_set_name(cur, "down_ffn"); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + offload_func(cur); + } + else { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + // offload_func(cur); + + // cur = ggml_axpy(ctx0, + // model.layers[il].c_mlp_proj_w_t, + // cur, + // NULL, + // NULL); + // offload_func(cur); + + + //here + // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, + // model.layers[il].c_mlp_proj_w_gpu, + // cur, + // model.layers[il].gpu_bucket, + // NULL); + // ggml_set_name(tmp, "axpy"); + // offload_func(tmp); + // offload_debug(tmp); + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur_c, + // NULL, + // NULL); + idx, + model.layers[il].gpu_bucket); + // model.layers[il].gpu_idx); + // offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); + offload_func(cur); + + // tmp = ggml_add(ctx0, + // tmp, + // model.layers[il].c_mlp_proj_b); + // offload_func(tmp); + // offload_debug(tmp); + + // cur = ggml_add(ctx0, cur, tmp); + // offload_func(cur); + } + + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + offload_func(inpL); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + offload_func_nr(inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_mul(ctx0, + inpL, + model.ln_f_g); + offload_func_nr(inpL); + inpL = ggml_add(ctx0, + inpL, + model.ln_f_b); + ggml_set_name(inpL, "before"); + offload_func_nr(inpL); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "last_layer"); +// offload_func(inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - 
embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } +#endif + + + + // run the computation + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + static std::vector work_buffer; + work_buffer.resize(plan.work_size); + plan.work_data = work_buffer.data(); + ggml_graph_compute(gf, &plan); + + //if (n_past%100 == 0) { + // ggml_graph_print (gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, "hello world"); + } + printf("load finish\n"); + + // keep this buffer alive while evaluating the model + + struct ggml_allocr * allocr = NULL; + // allocate the compute buffer + { + allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + // 
compute_buffer.resize(mem_size); + compute_buffer = ggml_cuda_host_malloc(mem_size); + // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + int cnt = 0; + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + cnt += 1; + + if (cnt > 0) + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/gpt-2-sparse/main13b.cpp b/examples/gpt-2-sparse/main13b.cpp new file mode 100644 index 00000000..0681da3e --- /dev/null +++ b/examples/gpt-2-sparse/main13b.cpp @@ -0,0 +1,1583 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-cuda.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif +typedef void (*offload_func_t)(struct ggml_tensor * tensor); +void opt_nop(struct ggml_tensor * tensor) { // don't 
offload by default + (void) tensor; +} +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + // struct ggml_tensor * c_attn_attn_w; + // struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_attn_q_w; + struct ggml_tensor * c_attn_attn_q_b; + + struct ggml_tensor * c_attn_attn_k_w; + struct ggml_tensor * c_attn_attn_k_b; + + struct ggml_tensor * c_attn_attn_v_w; + struct ggml_tensor * c_attn_attn_v_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; + + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; + // gpu heat + struct ggml_tensor * c_mlp_fc_w_gpu; + struct ggml_tensor * c_mlp_proj_w_t; + struct ggml_tensor * c_mlp_proj_w_gpu; + + //predictor + struct ggml_tensor * mlp_pre_w1_w; + struct ggml_tensor * mlp_pre_w2_w; +}; + +struct opt_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + opt_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error("opt_file fail\n"); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + ~opt_file() { + if (fp) { + std::fclose(fp); + } + } +}; +#define _POSIX_MAPPED_FILES +#include +#include + +struct opt_mmap { + void * addr; + size_t size; + + opt_mmap(const opt_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error("mmap failed\n"); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~opt_mmap() { + munmap(addr, size); + } +#else + static constexpr bool SUPPORTED = false; + + opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw 
std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +struct gpt2_model { + gpt2_hparams hparams; + struct opt_file * file; + struct opt_mmap * mapping; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct ggml_context * ctx0 = nullptr; +// std::vector compute_buffer; +void *compute_buffer; + +bool endsWith(const std::string& str, const std::string& suffix) { + if (str.length() < suffix.length()) { + return false; + } + return str.substr(str.length() - suffix.length()) == suffix; +} + + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + model.file = new opt_file(fname.c_str(), "rb"); + printf("size %d\n", model.file->size); + model.mapping = new opt_mmap(model.file, 0, false); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + /* int32_t n_vocab = 0; */ + /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ + + /* if (n_vocab != model.hparams.n_vocab) { */ + /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ + /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ + /* return false; */ + /* } */ + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + 
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + printf("wtype %d\n", wtype); + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + //need refactor + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); + //predictor + ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size = 0; + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*51200; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + int main_gpu = 0; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#else +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + 
const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; + // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; + + // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); + // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // model.lm_head->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["output_norm.weight"] = &model.ln_f_g; + model.tensors["output_norm.bias"] = &model.ln_f_b; + + model.tensors["tok_embeddings.weight"] = &model.wte; + model.tensors["pos_embeddings.weight"] = &model.wpe; + model.tensors["output.weight"] = &model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + memset(&layer, 0, sizeof(gpt2_layer)); + + // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + // // need refine + // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); + // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); + // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); + + // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); + // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); + + // if (i <= 10) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); + // } else if (i <= 12) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); + // } else if (i <= 18) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); + + // } else if (i <= 21) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); + // } else if (i <= 26) { + // 
layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); + // } else if (i <= 31) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); + // } + + // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; + + // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; + // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; + model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; + + model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; + model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; + model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; + model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; + model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; + model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; + + model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; + model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; + + model.tensors["layers." 
+ std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; + model.tensors["layers." + std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; + } + } + + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + #ifdef GGML_USE_CUBLAS + // ggml_cuda_assign_buffers_no_scratch(model.memory_k); + // ggml_cuda_assign_buffers_no_scratch(model.memory_v); + #endif + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + ggml_set_no_alloc(ctx, true); + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + const std::vector to_gpu = { + "output_norm.bias", + "output_norm.weight", + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + // ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + "layers.*output_norm.weight", + "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_gpu_lv = { + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + // "layers.*output_norm.weight", + // "layers.*output_norm.bias", + // ".*fc1.weight", + // ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_lock = { + "tok_embeddings.weight", + "pos_embeddings.weight", + // "output_norm.bias", + ".*attention.wq.weight", + ".*attention.wq.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + // ".*attention.wo.bias", + ".*feed_forward.w1.weight", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight", + ".*feed_forward.w2.bias", + ".*gpu.weight", + ".*attention_norm.weight", + ".*attention_norm.bias", + ".*output_norm.weight", + ".*output_norm.bias", + 
".*attention.*fc1.weight", + ".*attention.*fc1.bias", + ".*attention.*fc2.weight", + ".*attention.*fc2.bias", + // ".*w2.bias", + // ".*w1.bias", + "output.weight", + }; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + int64_t new_ne[2]; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + new_ne[i] = ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + ggml_tensor ** ptr = model.tensors[name]; + // printf("name %s ptr %p\n", name.c_str(), *ptr); + // int k; + // scanf("%d", &k); + *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); + + auto tensor = (ggml_tensor *)*model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + std::streampos offset = fin.tellg(); + // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.seekg(ggml_nbytes(tensor), std::ios::cur); + tensor->data = model.mapping->addr + static_cast(offset); + // if ( endsWith(name.c_str(), "weight_transpose")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i+4096]); + // } + // } + // printf("\n"); + // if (endsWith(name.c_str(), "weight_h20")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i]); + + // } + // int k; + // scanf("%d", &k); + // } + + // // GPT-2 models share the WTE tensor as the LM head + // if (name == "model/wte" && has_lm_head == false) { + // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + // } + + // if (name == "model/lm_head") { + // has_lm_head = true; + // } + if (model_params.low_vram == false) { + for (const auto &s : to_gpu) + { + // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) + // { + // std::regex pattern(R"(\d+)"); + // std::smatch match; + // int layer_id = 0; + // if (std::regex_search(name, match, pattern)) + // { + // std::string digitStr = match.str(); + // int num = std::stoi(digitStr); + // layer_id = num; + // } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + // if (layer_id > 
model_params.n_gpu_layers) + // break; + // } + // printf("name %s\n", name.c_str()); + if (std::regex_search(name, std::regex(s))) + { + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + } else { + for (const auto &s : to_gpu_lv) + { + if (std::regex_search(name, std::regex(s))) + { + std::regex pattern(R"(\d+)"); + std::smatch match; + int layer_id = 0; + if (std::regex_search(name, match, pattern)) + { + std::string digitStr = match.str(); + int num = std::stoi(digitStr); + layer_id = num; + } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + if (layer_id > model_params.n_gpu_layers) + break; + // printf("name %s\n", name.c_str()); + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + + } + if (tensor->backend == GGML_BACKEND_GPU) { + #if defined(GGML_USE_CUBLAS) + ggml_cuda_transform_tensor(tensor->data, tensor); + #endif + } + for (const auto &s : to_lock) + { + if (std::regex_match(name, std::regex(s))) + { + if(!mlock(tensor->data, ggml_nbytes(tensor))) { + // printf("mlock %s\n", name.c_str()); + } + else { + printf("mlock failed %s\n", name.c_str()); + } + } + } + + total_size += ggml_nbytes(tensor); + } + ggml_set_no_alloc(ctx, false); + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + // static std::vector buf(buf_size); + static void * buf = ggml_cuda_host_malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i + 2; + } + } + offload_func_t offload_func = opt_nop; + offload_func_t offload_func_kq = opt_nop; + offload_func_t offload_func_v = opt_nop; + offload_func_t offload_func_nr = opt_nop; + offload_func_t offload_debug = opt_nop; +#ifdef GGML_USE_CUBLAS + offload_debug = ggml_cuda_assign_buffers_no_alloc; + // offload_func = ggml_cuda_assign_buffers_no_alloc; + // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + // offload_func_v = ggml_cuda_assign_buffers_no_alloc; + // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; +#endif + // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; + // int k; + // scanf("%d", &k); + + struct ggml_tensor * KQ_scale = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL_first"); + // offload_func(inpL); + + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + offload_func(cur); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g); + offload_func(cur); + ggml_set_name(cur, "ln_1_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_1_b); + ggml_set_name(cur, "ln_1_b"); + // offload_func(cur); + + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + + struct ggml_tensor *k_cpy = nullptr; + struct ggml_tensor *v_cpy = nullptr; + // self-attention + { + // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); + offload_func_kq(Qcur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); + offload_func_kq(Kcur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); + offload_func_kq(Kcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); + offload_func_v(Vcur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); + offload_func_v(Vcur); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + offload_func_v(Vcur); + + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); + // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); + + offload_func_v(v); + k_cpy = ggml_cpy(ctx0, Kcur, k); + offload_func_kq(k_cpy); + ggml_set_name(k_cpy, "k_cpy"); + v_cpy = ggml_cpy(ctx0, Vcur, v); + offload_func_v(v_cpy); + ggml_set_name(v_cpy, "v_cpy"); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); + offload_func_kq(Qcur); + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + offload_func_kq(Q); + + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + // struct ggml_tensor * K = + // ggml_permute(ctx0, 
+ // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, model.memory_k, + 128, n_past + N, n_head, + ggml_element_size(model.memory_k)*n_embd, + ggml_element_size(model.memory_k)*128, + ggml_element_size(model.memory_k)*n_embd*n_ctx*il); + K->src[1] = k_cpy; + offload_func_kq(K); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + offload_func_kq(KQ_scaled); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, 128, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*128, + n_ctx*ggml_element_size(model.memory_k)*n_embd*il); + V->src[1] = v_cpy; + offload_func_v(V); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merge_cont"); + offload_func_v(cur); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_set_name(cur, "attn_proj"); + offload_func(cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_set_name(cur, "attn_bias"); + offload_func(cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + ggml_set_name(cur, "after attn"); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + offload_func(cur); + ggml_set_name(cur, "norm_FFN"); + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g); + offload_func(cur); + ggml_set_name(cur, "norm_FFN_g"); + cur = ggml_add(ctx0, + cur, + 
model.layers[il].ln_2_b); + // offload_func(cur); + // ggml_set_name(cur, "norm_FFN_w"); + // cur_c = ggml_dup(ctx0, cur); + } + // if (N == 1) + if (1) + { + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w1_w, + cur); + offload_func(idx); + ggml_set_name(idx, "mlp_pre_w1"); + idx = ggml_relu(ctx0, idx); + offload_func(idx); + ggml_set_name(idx, "relu_pre"); + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w2_w, + idx); + ggml_set_name(idx, "mlp_pre_w2"); + // offload_func(idx); + // idx = ggml_sigmoid(ctx0, idx); + // offload_func(idx); + // idx_g = idx; + // idx = ggml_dup(ctx0, idx_g); + // ggml_set_name(idx, "idx_cpu_dup"); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + // if (N != 1) + if (0) + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + offload_func(cur); + ggml_set_name(cur, "up_ffn"); + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + offload_func(cur); + } + else + { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_fc_w, + // cur); + // offload_func(cur); + // cur = ggml_add(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b); + // offload_func(cur); + + + struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, + model.layers[il].c_mlp_fc_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "mlp_up_gpu"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_mul_mat_idx(ctx0, + model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_idx); + ggml_set_name(cur, "mlp_up_cpu"); + tmp = ggml_add_idx(ctx0, + tmp, + model.layers[il].c_mlp_fc_b, + idx); + ggml_set_name(tmp, "mlp_up_bias"); + offload_debug(tmp); + offload_func(tmp); + + cur = ggml_add(ctx0, cur, tmp); + ggml_set_name(cur, "mlp_up_mix"); + offload_func(cur); + + // cur = tmp; + + } + + + + // GELU activation + // [3072, N] + cur = ggml_relu(ctx0, cur); + // cur_c = cur; + // offload_func(cur); + cur_c = cur->backend==GGML_BACKEND_CPU? 
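+ // cur_c is the host-side handle on the ReLU activations used by the CPU half of
+ // the sparse down-projection below (ggml_axpy with c_mlp_proj_w_t / gpu_idx);
+ // `cur` itself keeps feeding the GPU half (c_mlp_proj_w_gpu / gpu_bucket).
+ // When the activations already live on the CPU they are used as-is; otherwise
+ // ggml_dup presumably yields a copy that stays on the CPU backend: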
cur : ggml_dup(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + // if (N != 1) { + if (0) { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + offload_func(cur); + ggml_set_name(cur, "down_ffn"); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + offload_func(cur); + } + else { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + // offload_func(cur); + + // cur = ggml_axpy(ctx0, + // model.layers[il].c_mlp_proj_w_t, + // cur, + // NULL, + // NULL); + // offload_func(cur); + + + // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, + // model.layers[il].c_mlp_proj_w_gpu, + // cur, + // model.layers[il].gpu_bucket, + // NULL); + struct ggml_tensor *tmp = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "axpy"); + offload_func(tmp); + offload_debug(tmp); + + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur_c, + idx, + model.layers[il].gpu_idx); + + cur = ggml_add(ctx0, cur, tmp); + offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); + offload_func(cur); + + // tmp = ggml_add(ctx0, + // tmp, + // model.layers[il].c_mlp_proj_b); + // offload_func(tmp); + // offload_debug(tmp); + + // cur = tmp; + } + + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + offload_func(inpL); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + offload_func_nr(inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_mul(ctx0, + inpL, + model.ln_f_g); + offload_func_nr(inpL); + inpL = ggml_add(ctx0, + inpL, + model.ln_f_b); + ggml_set_name(inpL, "before"); + offload_func_nr(inpL); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "last_layer"); +// offload_func(inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if 
(node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } +#endif + + + + // run the computation + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + static std::vector work_buffer; + work_buffer.resize(plan.work_size); + plan.work_data = work_buffer.data(); + ggml_graph_compute(gf, &plan); + + //if (n_past%100 == 0) { + // ggml_graph_print (gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, "hello world"); + } + printf("load finish\n"); + + // keep this buffer alive while evaluating the model + + struct ggml_allocr * allocr = NULL; + // allocate the compute buffer + { + allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + // compute_buffer.resize(mem_size); + compute_buffer = ggml_cuda_host_malloc(mem_size); + // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + 
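+ // Generation loop: the prompt is pushed through gpt2_eval in chunks of at most
+ // params.n_batch tokens; once it has been consumed, the logits of the last token
+ // are sampled with gpt_sample_top_k_top_p and the sampled id is fed back in as a
+ // single-token batch. The loop stops at the end-of-text token (50256) or after
+ // n_predict generated tokens.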
std::vector embd; + + int cnt = 0; + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + cnt += 1; + + if (cnt > 0) + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + llama_sampling_params & sparams = params.sparams; + const int top_k = sparams.top_k; + const float top_p = sparams.top_p; + const float temp = sparams.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/gpt-2-sparse/main7b.cpp b/examples/gpt-2-sparse/main7b.cpp new file mode 100644 index 00000000..a07a5472 --- /dev/null +++ b/examples/gpt-2-sparse/main7b.cpp @@ -0,0 +1,1567 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-cuda.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif +typedef void (*offload_func_t)(struct ggml_tensor * tensor); +void opt_nop(struct ggml_tensor * tensor) { // don't offload by default + (void) tensor; +} +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + // struct ggml_tensor * c_attn_attn_w; + // struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_attn_q_w; + struct ggml_tensor * c_attn_attn_q_b; + + struct ggml_tensor * c_attn_attn_k_w; + struct ggml_tensor * c_attn_attn_k_b; + + struct ggml_tensor * c_attn_attn_v_w; + struct ggml_tensor * c_attn_attn_v_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor 
* c_mlp_proj_b; + + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; + // gpu heat + struct ggml_tensor * c_mlp_fc_w_gpu; + struct ggml_tensor * c_mlp_proj_w_t; + struct ggml_tensor * c_mlp_proj_w_gpu; + + //predictor + struct ggml_tensor * mlp_pre_w1_w; + struct ggml_tensor * mlp_pre_w2_w; +}; + +struct opt_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + opt_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error("opt_file fail\n"); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + ~opt_file() { + if (fp) { + std::fclose(fp); + } + } +}; +#define _POSIX_MAPPED_FILES +#include +#include + +struct opt_mmap { + void * addr; + size_t size; + + opt_mmap(const opt_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error("mmap failed\n"); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~opt_mmap() { + munmap(addr, size); + } +#else + static constexpr bool SUPPORTED = false; + + opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +struct gpt2_model { + gpt2_hparams hparams; + struct opt_file * file; + struct opt_mmap * mapping; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct ggml_context * ctx0 = nullptr; +// std::vector compute_buffer; +void *compute_buffer; + +bool endsWith(const std::string& str, const std::string& suffix) { + if (str.length() < suffix.length()) { + return false; + } + return str.substr(str.length() - suffix.length()) == suffix; +} + + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params 
model_params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + model.file = new opt_file(fname.c_str(), "rb"); + printf("size %d\n", model.file->size); + model.mapping = new opt_mmap(model.file, 0, false); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + /* int32_t n_vocab = 0; */ + /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ + + /* if (n_vocab != model.hparams.n_vocab) { */ + /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ + /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ + /* return false; */ + /* } */ + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + printf("wtype %d\n", wtype); + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); 
// ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + //need refactor + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); + //predictor + ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size = 0; + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*51200; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + int main_gpu = 0; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#else +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; + // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; + + // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); + // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // model.lm_head->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["output_norm.weight"] = &model.ln_f_g; + model.tensors["output_norm.bias"] = &model.ln_f_b; + + model.tensors["tok_embeddings.weight"] = &model.wte; + model.tensors["pos_embeddings.weight"] = &model.wpe; + model.tensors["output.weight"] = &model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = 
model.layers[i]; + memset(&layer, 0, sizeof(gpt2_layer)); + + // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + // // need refine + // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); + // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); + // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); + + // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); + // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); + + // if (i <= 10) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); + // } else if (i <= 12) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); + // } else if (i <= 18) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); + + // } else if (i <= 21) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); + // } else if (i <= 26) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); + // } else if (i <= 31) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); + // } + + // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_b->backend = 
OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; + + // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; + // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; + model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; + + model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; + model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; + model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; + model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; + model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; + model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; + + model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; + model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; + model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; + } + } + + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + #ifdef GGML_USE_CUBLAS + ggml_cuda_assign_buffers_no_scratch(model.memory_k); + ggml_cuda_assign_buffers_no_scratch(model.memory_v); + #endif + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + ggml_set_no_alloc(ctx, true); + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + const std::vector to_gpu = { + "output_norm.bias", + "output_norm.weight", + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + // ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + "layers.*output_norm.weight", + "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_gpu_lv = { + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + // "layers.*output_norm.weight", + // "layers.*output_norm.bias", + // ".*fc1.weight", + // ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_lock = { + "tok_embeddings.weight", + "pos_embeddings.weight", + // "output_norm.bias", + ".*attention.wq.weight", + ".*attention.wq.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + // ".*attention.wo.bias", + ".*feed_forward.w1.weight", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight", + ".*feed_forward.w2.bias", + ".*gpu.weight", + ".*attention_norm.weight", + ".*attention_norm.bias", + ".*output_norm.weight", + ".*output_norm.bias", + ".*attention.*fc1.weight", + ".*attention.*fc1.bias", + ".*attention.*fc2.weight", + ".*attention.*fc2.bias", + // ".*w2.bias", + // ".*w1.bias", + "output.weight", + }; + + while (true) { + int32_t n_dims; + int32_t length; + 
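+            // Each tensor record in the file is: n_dims, name length and ttype (one int32 each),
+            // followed by the dimensions, the name bytes and the raw tensor data.
+            // Note that the data is not copied below: tensor->data is pointed directly into the
+            // read-only mmap of the model file, and the input stream just seeks past it.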
int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + int64_t new_ne[2]; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + new_ne[i] = ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + ggml_tensor ** ptr = model.tensors[name]; + // printf("name %s ptr %p\n", name.c_str(), *ptr); + // int k; + // scanf("%d", &k); + *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); + + auto tensor = (ggml_tensor *)*model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + std::streampos offset = fin.tellg(); + // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.seekg(ggml_nbytes(tensor), std::ios::cur); + tensor->data = model.mapping->addr + static_cast(offset); + // if ( endsWith(name.c_str(), "weight_transpose")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i+4096]); + // } + // } + // printf("\n"); + // if (endsWith(name.c_str(), "weight_h20")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i]); + + // } + // int k; + // scanf("%d", &k); + // } + + // // GPT-2 models share the WTE tensor as the LM head + // if (name == "model/wte" && has_lm_head == false) { + // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + // } + + // if (name == "model/lm_head") { + // has_lm_head = true; + // } + if (model_params.low_vram == false) { + for (const auto &s : to_gpu) + { + if (std::regex_search(name, std::regex(s))) + { + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + } else { + for (const auto &s : to_gpu_lv) + { + if (std::regex_search(name, std::regex(s))) + { + std::regex pattern(R"(\d+)"); + std::smatch match; + int layer_id = 0; + if (std::regex_search(name, match, pattern)) + { + std::string digitStr = match.str(); + int num = std::stoi(digitStr); + layer_id = num; + } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + if (layer_id > model_params.n_gpu_layers) + break; + // printf("name %s\n", name.c_str()); + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + + } + if (tensor->backend == GGML_BACKEND_GPU) { 
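+                // Tensors tagged for the GPU above are handed to the CUDA backend here;
+                // on builds without CUBLAS this block is a no-op and the data stays in the mmap.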
+ #if defined(GGML_USE_CUBLAS) + ggml_cuda_transform_tensor(tensor->data, tensor); + #endif + } + for (const auto &s : to_lock) + { + if (std::regex_match(name, std::regex(s))) + { + if(!mlock(tensor->data, ggml_nbytes(tensor))) { + // printf("mlock %s\n", name.c_str()); + } + else { + printf("mlock failed %s\n", name.c_str()); + } + } + } + + total_size += ggml_nbytes(tensor); + } + ggml_set_no_alloc(ctx, false); + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + // static std::vector buf(buf_size); + static void * buf = ggml_cuda_host_malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i + 2; + } + } + offload_func_t offload_func = opt_nop; + offload_func_t offload_func_kq = opt_nop; + offload_func_t offload_func_v = opt_nop; + offload_func_t offload_func_nr = opt_nop; + offload_func_t offload_debug = opt_nop; +#ifdef GGML_USE_CUBLAS + offload_debug = ggml_cuda_assign_buffers_no_alloc; + offload_func = ggml_cuda_assign_buffers_no_alloc; + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; +#endif + // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; + // int k; + // scanf("%d", &k); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL_first"); + // offload_func(inpL); + + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + offload_func(cur); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g); + offload_func(cur); + ggml_set_name(cur, "ln_1_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_1_b); + 
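+            // What follows is a standard GPT-style decoder block: masked self-attention over the
+            // cached K/V memory, a residual add, and then the feed-forward network, which is
+            // evaluated sparsely with the help of the per-layer predictor weights (mlp_pre_w1/w2).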
ggml_set_name(cur, "ln_1_b"); + // offload_func(cur); + + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + + struct ggml_tensor *k_cpy = nullptr; + struct ggml_tensor *v_cpy = nullptr; + // self-attention + { + // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); + offload_func_kq(Qcur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); + offload_func_kq(Kcur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); + offload_func_kq(Kcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); + offload_func_v(Vcur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); + offload_func_v(Vcur); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + offload_func_v(Vcur); + + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); + // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); + + offload_func_v(v); + k_cpy = ggml_cpy(ctx0, Kcur, k); + offload_func_kq(k_cpy); + ggml_set_name(k_cpy, "k_cpy"); + v_cpy = ggml_cpy(ctx0, Vcur, v); + offload_func_v(v_cpy); + ggml_set_name(v_cpy, "v_cpy"); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); + offload_func_kq(Qcur); + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + offload_func_kq(Q); + + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + // struct ggml_tensor * K = + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, model.memory_k, + 128, n_past + N, n_head, + ggml_element_size(model.memory_k)*n_embd, + ggml_element_size(model.memory_k)*128, + ggml_element_size(model.memory_k)*n_embd*n_ctx*il); + K->src[1] = k_cpy; + offload_func_kq(K); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // 
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + offload_func_kq(KQ_scaled); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, 128, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*128, + n_ctx*ggml_element_size(model.memory_k)*n_embd*il); + V->src[1] = v_cpy; + offload_func_v(V); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merge_cont"); + offload_func_v(cur); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_set_name(cur, "attn_proj"); + offload_func(cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_set_name(cur, "attn_bias"); + offload_func(cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + ggml_set_name(cur, "after attn"); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + offload_func(cur); + ggml_set_name(cur, "norm_FFN"); + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g); + offload_func(cur); + ggml_set_name(cur, "norm_FFN_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_2_b); + // offload_func(cur); + // ggml_set_name(cur, "norm_FFN_w"); + // cur_c = ggml_dup(ctx0, cur); + } + // if (N == 1) + if (1) + { + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w1_w, + cur); + offload_func(idx); + ggml_set_name(idx, "mlp_pre_w1"); + idx = ggml_relu(ctx0, idx); + offload_func(idx); + ggml_set_name(idx, "relu_pre"); + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w2_w, + idx); + ggml_set_name(idx, "mlp_pre_w2"); + // offload_func(idx); + // idx = ggml_sigmoid(ctx0, idx); + // offload_func(idx); + // idx_g = idx; + // idx = ggml_dup(ctx0, idx_g); + // ggml_set_name(idx, "idx_cpu_dup"); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) 
+ // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + if (N >= 80) + // if (0) + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + offload_func(cur); + ggml_set_name(cur, "up_ffn"); + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + offload_func(cur); + } + else + { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_fc_w, + // cur); + // offload_func(cur); + // cur = ggml_add(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b); + // offload_func(cur); + + + struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, + model.layers[il].c_mlp_fc_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "mlp_up_gpu"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_mul_mat_idx(ctx0, + model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_idx); + ggml_set_name(cur, "mlp_up_cpu"); + tmp = ggml_add_idx(ctx0, + tmp, + model.layers[il].c_mlp_fc_b, + idx); + ggml_set_name(tmp, "mlp_up_bias"); + offload_debug(tmp); + offload_func(tmp); + + cur = ggml_add(ctx0, cur, tmp); + ggml_set_name(cur, "mlp_up_mix"); + offload_func(cur); + + // cur = tmp; + + } + + + + // GELU activation + // [3072, N] + cur = ggml_relu(ctx0, cur); + // cur_c = cur; + // offload_func(cur); + cur_c = cur->backend==GGML_BACKEND_CPU? cur : ggml_dup(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + if (N >= 80) { + // if (0) { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + offload_func(cur); + ggml_set_name(cur, "down_ffn"); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + offload_func(cur); + } + else { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + // offload_func(cur); + + // cur = ggml_axpy(ctx0, + // model.layers[il].c_mlp_proj_w_t, + // cur, + // NULL, + // NULL); + // offload_func(cur); + + + // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, + // model.layers[il].c_mlp_proj_w_gpu, + // cur, + // model.layers[il].gpu_bucket, + // NULL); + struct ggml_tensor *tmp = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "axpy"); + offload_func(tmp); + offload_debug(tmp); + + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur_c, + idx, + model.layers[il].gpu_idx); + + cur = ggml_add(ctx0, cur, tmp); + offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); + offload_func(cur); + + // tmp = ggml_add(ctx0, + // tmp, + // model.layers[il].c_mlp_proj_b); + // offload_func(tmp); + // offload_debug(tmp); + + // cur = tmp; + } + + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + offload_func(inpL); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + offload_func_nr(inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_mul(ctx0, + inpL, + model.ln_f_g); + offload_func_nr(inpL); + inpL = ggml_add(ctx0, + inpL, + model.ln_f_b); + ggml_set_name(inpL, "before"); + offload_func_nr(inpL); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "last_layer"); +// offload_func(inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate 
the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } +#endif + + + + // run the computation + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + static std::vector work_buffer; + work_buffer.resize(plan.work_size); + plan.work_data = work_buffer.data(); + ggml_graph_compute(gf, &plan); + + //if (n_past%100 == 0) { + // ggml_graph_print (gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, "hello world"); + } + printf("load finish\n"); + + // keep this buffer alive while evaluating the model + + struct ggml_allocr * allocr = NULL; + // allocate the compute buffer + { + allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = 
gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + // compute_buffer.resize(mem_size); + compute_buffer = ggml_cuda_host_malloc(mem_size); + // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + int cnt = 0; + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + cnt += 1; + + if (cnt > 0) + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + llama_sampling_params & sparams = params.sparams; + const int top_k = sparams.top_k; + const float top_p = sparams.top_p; + const float temp = sparams.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/gpt-2-sparse/quantize.cpp b/examples/gpt-2-sparse/quantize.cpp new file mode 100644 index 00000000..f81c04e8 --- /dev/null +++ b/examples/gpt-2-sparse/quantize.cpp @@ -0,0 +1,184 @@ +#include "ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include 
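+// Stand-alone quantization tool: reads an f16/f32 gpt-2 ggml model file and rewrites its
+// large 2-D weight matrices in one of the integer formats (q4_0, q4_1, q5_0, q5_1, q8_0)
+// selected on the command line; everything else is copied through unchanged.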
+#include +#include +#include +#include +#include +#include + +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; +}; + +// quantize a model +bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { + gpt_vocab vocab; + + printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + + auto finp = std::ifstream(fname_inp, std::ios::binary); + if (!finp) { + fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); + return false; + } + + auto fout = std::ofstream(fname_out, std::ios::binary); + if (!fout) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + finp.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); + return false; + } + + fout.write((char *) &magic, sizeof(magic)); + } + + gpt2_hparams hparams; + + // load hparams + { + finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); + + fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); + fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); + } + + // load vocab + { + int32_t n_vocab = 0; + finp.read ((char *) &n_vocab, sizeof(n_vocab)); + fout.write((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); + return false; + } + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + finp.read ((char *) &len, sizeof(len)); + fout.write((char *) &len, sizeof(len)); + + word.resize(len); + finp.read ((char *) word.data(), len); + fout.write((char *) word.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // regexes of tensor names to be quantized + const std::vector to_quant = { + "model/wte", + "model/lm_head", + "model/h.*/attn/c_attn/w", + "model/h.*/attn/c_proj/w", + "model/h.*/mlp/c_fc/w", + "model/h.*/mlp/c_proj/w", + }; + + if 
(!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); + return false; + } + + finp.close(); + fout.close(); + + return true; +} + +// usage: +// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type +// +int main(int argc, char ** argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); + ggml_print_ftypes(stderr); + return 1; + } + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + const std::string fname_inp = argv[1]; + const std::string fname_out = argv[2]; + + const ggml_ftype ftype = ggml_parse_ftype(argv[3]); + + const int64_t t_main_start_us = ggml_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = ggml_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + return 0; +} From 98d035ebab161d4b3fedbdc98bbf1908e6621481 Mon Sep 17 00:00:00 2001 From: "a.r.l" Date: Thu, 16 Jan 2025 16:53:19 +0800 Subject: [PATCH 2/4] feat: add opt model into llama.cpp --- examples/gpt-2-sparse/CMakeLists.txt | 15 - examples/gpt-2-sparse/README.md | 158 -- .../gpt-2-sparse/convert-cerebras-to-ggml.py | 183 -- examples/gpt-2-sparse/convert-ckpt-to-ggml.py | 159 -- examples/gpt-2-sparse/convert-h5-to-ggml.py | 195 -- examples/gpt-2-sparse/download-ggml-model.sh | 69 - examples/gpt-2-sparse/download-model.sh | 48 - examples/gpt-2-sparse/main-30b.cpp | 1593 ----------------- examples/gpt-2-sparse/main.cpp_123 | 1592 ---------------- examples/gpt-2-sparse/main.cpp_bak | 1546 ---------------- examples/gpt-2-sparse/main13b.cpp | 1583 ---------------- examples/gpt-2-sparse/main7b.cpp | 1567 ---------------- examples/gpt-2-sparse/quantize.cpp | 184 -- llama.cpp | 213 +++ 14 files changed, 213 insertions(+), 8892 deletions(-) delete mode 100644 examples/gpt-2-sparse/CMakeLists.txt delete mode 100644 examples/gpt-2-sparse/README.md delete mode 100644 examples/gpt-2-sparse/convert-cerebras-to-ggml.py delete mode 100644 examples/gpt-2-sparse/convert-ckpt-to-ggml.py delete mode 100644 examples/gpt-2-sparse/convert-h5-to-ggml.py delete mode 100755 examples/gpt-2-sparse/download-ggml-model.sh delete mode 100755 examples/gpt-2-sparse/download-model.sh delete mode 100644 examples/gpt-2-sparse/main-30b.cpp delete mode 100644 examples/gpt-2-sparse/main.cpp_123 delete mode 100644 examples/gpt-2-sparse/main.cpp_bak delete mode 100644 examples/gpt-2-sparse/main13b.cpp delete mode 100644 examples/gpt-2-sparse/main7b.cpp delete mode 100644 examples/gpt-2-sparse/quantize.cpp diff --git a/examples/gpt-2-sparse/CMakeLists.txt b/examples/gpt-2-sparse/CMakeLists.txt deleted file mode 100644 index a06b42dc..00000000 --- a/examples/gpt-2-sparse/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# -# gpt-2 - -set(TEST_TARGET gpt-2-sparse) -add_executable(${TEST_TARGET} main7b.cpp) -# target_link_libraries(${TEST_TARGET} PRIVATE ggml common 
common-ggml) -target_link_libraries(${TEST_TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) - -# -# gpt-2-quantize - -set(TEST_TARGET gpt-2-quantize) -add_executable(${TEST_TARGET} quantize.cpp) -# target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common) diff --git a/examples/gpt-2-sparse/README.md b/examples/gpt-2-sparse/README.md deleted file mode 100644 index 509fabc5..00000000 --- a/examples/gpt-2-sparse/README.md +++ /dev/null @@ -1,158 +0,0 @@ -# gpt-2 - -This is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library. - -The program runs on the CPU - no video card is required. - -The [Cerebras-GPT](https://huggingface.co/cerebras) models are also supported. - -The example supports the following GPT-2 models: - -| Model | Description | Disk Size | -| --- | --- | --- | -| 117M | Small model | 240 MB | -| 345M | Medium model | 680 MB | -| 774M | Large model | 1.5 GB | -| 1558M | XL model | 3.0 GB | - -Sample performance on MacBook M1 Pro: - -| Model | Size | Time / Token | -| --- | --- | --- | -| GPT-2 | 117M | 5 ms | -| GPT-2 | 345M | 12 ms | -| GPT-2 | 774M | 23 ms | -| GPT-2 | 1558M | 42 ms | - -*TODO: add tables for Cerebras-GPT models* - -Sample output: - -``` -$ ./bin/gpt-2 -h -usage: ./bin/gpt-2 [options] - -options: - -h, --help show this help message and exit - -s SEED, --seed SEED RNG seed (default: -1) - -t N, --threads N number of threads to use during computation (default: 8) - -p PROMPT, --prompt PROMPT - prompt to start generation with (default: random) - -n N, --n_predict N number of tokens to predict (default: 200) - --top_k N top-k sampling (default: 40) - --top_p N top-p sampling (default: 0.9) - --temp N temperature (default: 1.0) - -b N, --batch_size N batch size for prompt processing (default: 8) - -m FNAME, --model FNAME - model path (default: models/gpt-2-117M/ggml-model.bin) - -$ ./bin/gpt-2 -gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin' -gpt2_model_load: n_vocab = 50257 -gpt2_model_load: n_ctx = 1024 -gpt2_model_load: n_embd = 768 -gpt2_model_load: n_head = 12 -gpt2_model_load: n_layer = 12 -gpt2_model_load: f16 = 1 -gpt2_model_load: ggml ctx size = 311.12 MB -gpt2_model_load: memory size = 72.00 MB, n_mem = 12288 -gpt2_model_load: model size = 239.08 MB -main: number of tokens in prompt = 1 - -So this is going to be the end of the line for us. - -If the Dolphins continue to do their business, it's possible that the team could make a bid to bring in new defensive coordinator Scott Linehan. - -Linehan's job is a little daunting, but he's a great coach and an excellent coach. I don't believe we're going to make the playoffs. - -We're going to have to work hard to keep our heads down and get ready to go.<|endoftext|> - -main: mem per token = 2048612 bytes -main: load time = 106.32 ms -main: sample time = 7.10 ms -main: predict time = 506.40 ms / 5.06 ms per token -main: total time = 629.84 ms -``` - -## Downloading and converting the original models (GPT-2) - -You can download the original model files using the [download-model.sh](download-model.sh) Bash script. The models are -in Tensorflow format, so in order to use them with ggml, you need to convert them to appropriate format. This is done -via the [convert-ckpt-to-ggml.py](convert-ckpt-to-ggml.py) python script. 
- -Here is the entire process for the GPT-2 117M model (download from official site + conversion): - -``` -cd ggml/build -../examples/gpt-2/download-model.sh 117M - -Downloading model 117M ... -models/gpt-2-117M/checkpoint 100%[=============================>] 77 --.-KB/s in 0s -models/gpt-2-117M/encoder.json 100%[=============================>] 1018K 1.20MB/s in 0.8s -models/gpt-2-117M/hparams.json 100%[=============================>] 90 --.-KB/s in 0s -models/gpt-2-117M/model.ckpt.data-00000-of-00001 100%[=============================>] 474.70M 1.21MB/s in 8m 39s -models/gpt-2-117M/model.ckpt.index 100%[=============================>] 5.09K --.-KB/s in 0s -models/gpt-2-117M/model.ckpt.meta 100%[=============================>] 460.11K 806KB/s in 0.6s -models/gpt-2-117M/vocab.bpe 100%[=============================>] 445.62K 799KB/s in 0.6s -Done! Model '117M' saved in 'models/gpt-2-117M/' - -Run the convert-ckpt-to-ggml.py script to convert the model to ggml format. - - python /Users/john/ggml/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-117M/ 1 - -``` - -This conversion requires that you have python and Tensorflow installed on your computer. Still, if you want to avoid -this, you can download the already converted ggml models as described below. - -## Downloading and converting the original models (Cerebras-GPT) - -Clone the respective repository from here: https://huggingface.co/cerebras - -Use the [convert-cerebras-to-ggml.py](convert-cerebras-to-ggml.py) script to convert the model to `ggml` format: - -``` -cd ggml/build -git clone https://huggingface.co/cerebras/Cerebras-GPT-111M models/ -python ../examples/gpt-2/convert-cerebras-to-ggml.py models/Cerebras-GPT-111M/ - -``` - -## Downloading the ggml model directly (GPT-2) - -For convenience, I will be hosting the converted ggml model files in order to make it easier to run the examples. This -way, you can directly download a single binary file and start using it. No python or Tensorflow is required. - -Here is how to get the 117M ggml model: - -``` -cd ggml/build -../examples/gpt-2/download-ggml-model.sh 117M - -Downloading ggml model 117M ... -models/gpt-2-117M/ggml-model.bin 100%[===============================>] 239.58M 8.52MB/s in 28s -Done! Model '117M' saved in 'models/gpt-2-117M/ggml-model.bin' -You can now use it like this: - - $ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example" - -``` - -At some point, I might decide to stop hosting these models. So in that case, simply revert to the manual process above. - -## Quantizing the models - -You can also try to quantize the `ggml` models via 4-bit integer quantization. -Keep in mind that for smaller models, this will render them completely useless. -You generally want to quantize larger models. 
- -``` -# quantize GPT-2 F16 to Q4_0 (faster but less precise) -./bin/gpt-2-quantize models/gpt-2-1558M/ggml-model-f16.bin models/gpt-2-1558M/ggml-model-q4_0.bin 2 -./bin/gpt-2 -m models/gpt-2-1558M/ggml-model-q4_0.bin -p "This is an example" - -# quantize Cerebras F16 to Q4_1 (slower but more precise) -./bin/gpt-2-quantize models/Cerebras-GPT-6.7B/ggml-model-f16.bin models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin 3 -./bin/gpt-2 -m models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin -p "This is an example" - -``` diff --git a/examples/gpt-2-sparse/convert-cerebras-to-ggml.py b/examples/gpt-2-sparse/convert-cerebras-to-ggml.py deleted file mode 100644 index 6057f81c..00000000 --- a/examples/gpt-2-sparse/convert-cerebras-to-ggml.py +++ /dev/null @@ -1,183 +0,0 @@ -# Convert Cerebras models to ggml format -# -# ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/ -# - -import sys -import struct -import json -import torch -import numpy as np -import re - -from transformers import AutoModelForCausalLM - -# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - -if len(sys.argv) < 2: - print("Usage: convert-cerebras-to-ggml.py dir-model [use-f32]\n") - sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -fname_out = sys.argv[1] + "/ggml-model-f16.bin" - -with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - encoder = json.load(f) - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -# use 16-bit or 32-bit floats -use_f16 = True -if len(sys.argv) > 2: - use_f16 = False - fname_out = sys.argv[1] + "/ggml-model-f32.bin" - -model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) -#print (model) - -list_vars = model.state_dict() -#print (list_vars) - -print(hparams) - -fout = open(fname_out, "wb") - -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex -fout.write(struct.pack("i", hparams["vocab_size"])) -fout.write(struct.pack("i", hparams["n_positions"])) -fout.write(struct.pack("i", hparams["n_embd"])) -fout.write(struct.pack("i", hparams["n_head"])) -fout.write(struct.pack("i", hparams["n_layer"])) -fout.write(struct.pack("i", use_f16)) - -byte_encoder = bytes_to_unicode() -byte_decoder = {v:k for k, v in byte_encoder.items()} - -fout.write(struct.pack("i", len(encoder))) - -for key in encoder: - text = bytearray([byte_decoder[c] for c in key]) - fout.write(struct.pack("i", len(text))) - fout.write(text) - -for name in list_vars.keys(): - data = list_vars[name].squeeze().numpy() - print("Processing variable: " + name + " with shape: ", data.shape) - - # rename headers to keep 
compatibility - if name == "transformer.ln_f.weight": - name = "model/ln_f/g" - elif name == "transformer.ln_f.bias": - name = "model/ln_f/b" - elif name == "transformer.wte.weight": - name = "model/wte" - elif name == "transformer.wpe.weight": - name = "model/wpe" - elif name == "lm_head.weight": - name = "model/lm_head" - elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_1/g" - elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_1/b" - elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_attn/w" - elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_attn/b" - elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_proj/w" - elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_proj/b" - elif re.match(r"transformer.h.\d+.ln_2.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_2/g" - elif re.match(r"transformer.h.\d+.ln_2.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_2/b" - elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_fc/w" - elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_fc/b" - elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_proj/w" - elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_proj/b" - else: - print("Unrecognized variable name. %s", name) - - # we don't need these - if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): - print(" Skipping variable: " + name) - continue - - n_dims = len(data.shape); - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype = 0; - if use_f16: - if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype = 0 - - # for efficiency - transpose the projection matrices - # "model/h.*/attn/c_attn/w" - # "model/h.*/attn/c_proj/w" - # "model/h.*/mlp/c_fc/w" - # "model/h.*/mlp/c_proj/w" - if name[-14:] == "/attn/c_attn/w" or \ - name[-14:] == "/attn/c_proj/w" or \ - name[-11:] == "/mlp/c_fc/w" or \ - name[-13:] == "/mlp/c_proj/w": - print(" Transposing") - data = data.transpose() - - # header - str = name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), ftype)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str); - - # data - data.tofile(fout) - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/gpt-2-sparse/convert-ckpt-to-ggml.py b/examples/gpt-2-sparse/convert-ckpt-to-ggml.py deleted file mode 100644 index 9113141f..00000000 --- a/examples/gpt-2-sparse/convert-ckpt-to-ggml.py +++ /dev/null @@ -1,159 +0,0 @@ -# Convert a model checkpoint to a ggml compatible file -# -# Load the model using TensorFlow. -# Iterate over all variables and write them to a binary file. 
-# -# For each variable, write the following: -# - Number of dimensions (int) -# - Name length (int) -# - Dimensions (int[n_dims]) -# - Name (char[name_length]) -# - Data (float[n_dims]) -# -# By default, the bigger matrices are converted to 16-bit floats. -# This can be disabled by adding the "use-f32" CLI argument. -# -# At the start of the ggml file we write the model parameters -# and vocabulary. -# - -import sys -import json -import struct -import numpy as np -import tensorflow as tf - -# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - -# helper method to convert a numpy array to different float types -def convert_to_ftype(data, ftype): - # fp16 - if ftype == 1: - return data.astype(np.float16) - - assert False, "Invalid ftype: " + str(ftype) - -if len(sys.argv) < 3: - print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n") - print(" ftype == 0 -> float32") - print(" ftype == 1 -> float16") - sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -fname_out = sys.argv[1] + "/ggml-model.bin" - -with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f: - encoder = json.load(f) - -with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if len(sys.argv) > 2: - ftype = int(sys.argv[2]) - if ftype < 0 or ftype > 1: - print("Invalid ftype: " + str(ftype)) - sys.exit(1) - fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" - -list_vars = tf.train.list_variables(dir_model) - -fout = open(fname_out, "wb") - -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex -fout.write(struct.pack("i", hparams["n_vocab"])) -fout.write(struct.pack("i", hparams["n_ctx"])) -fout.write(struct.pack("i", hparams["n_embd"])) -fout.write(struct.pack("i", hparams["n_head"])) -fout.write(struct.pack("i", hparams["n_layer"])) -fout.write(struct.pack("i", ftype)) - -byte_encoder = bytes_to_unicode() -byte_decoder = {v:k for k, v in byte_encoder.items()} - -fout.write(struct.pack("i", len(encoder))) - -for key in encoder: - text = bytearray([byte_decoder[c] for c in key]) - fout.write(struct.pack("i", len(text))) - fout.write(text) - -for name, shape in list_vars: - print("Processing variable: " + name + " with shape: ", shape) - - data = tf.train.load_variable(dir_model, name).squeeze() - n_dims = len(data.shape); - - # for efficiency - transpose the projection matrices - # "model/h.*/attn/c_attn/w" - # "model/h.*/attn/c_proj/w" - # "model/h.*/mlp/c_fc/w" - # "model/h.*/mlp/c_proj/w" - if name[-14:] == 
"/attn/c_attn/w" or \ - name[-14:] == "/attn/c_proj/w" or \ - name[-11:] == "/mlp/c_fc/w" or \ - name[-13:] == "/mlp/c_proj/w": - print(" Transposing") - data = data.transpose() - - dshape = data.shape - - ftype_cur = 0 - if ftype != 0: - # match name: - # "model/wte" - # "model/h.*/attn/c_attn/w" - # "model/h.*/attn/c_proj/w" - # "model/h.*/mlp/c_fc/w" - # "model/h.*/mlp/c_proj/w" - if name == "model/wte" or name[-2:] == "/w": - print(" Converting to " + ftype_str[ftype]) - data = convert_to_ftype(data, ftype) - ftype_cur = ftype - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - - # header - str = name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) - for i in range(n_dims): - fout.write(struct.pack("i", dshape[n_dims - 1 - i])) - fout.write(str); - - # data - data.tofile(fout) - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/gpt-2-sparse/convert-h5-to-ggml.py b/examples/gpt-2-sparse/convert-h5-to-ggml.py deleted file mode 100644 index 6a2b8654..00000000 --- a/examples/gpt-2-sparse/convert-h5-to-ggml.py +++ /dev/null @@ -1,195 +0,0 @@ -# Convert GPT-2 h5 transformer model to ggml format -# -# Load the model using GPT2Model. -# Iterate over all variables and write them to a binary file. -# -# For each variable, write the following: -# - Number of dimensions (int) -# - Name length (int) -# - Dimensions (int[n_dims]) -# - Name (char[name_length]) -# - Data (float[n_dims]) -# -# By default, the bigger matrices are converted to 16-bit floats. -# This can be disabled by adding the "use-f32" CLI argument. -# -# At the start of the ggml file we write the model parameters -# and vocabulary. -# - -import sys -import struct -import json -import numpy as np -import re - -from transformers import GPT2Model - -# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. 
- """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - -if len(sys.argv) < 2: - print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") - sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -fname_out = sys.argv[1] + "/ggml-model.bin" - -with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - encoder = json.load(f) - -with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: - encoder_added = json.load(f) - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -# use 16-bit or 32-bit floats -use_f16 = True -if len(sys.argv) > 2: - use_f16 = False - fname_out = sys.argv[1] + "/ggml-model-f32.bin" - -model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True) -#print (model) - -list_vars = model.state_dict() -#print (list_vars) - -fout = open(fname_out, "wb") - -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex -fout.write(struct.pack("i", hparams["vocab_size"])) -fout.write(struct.pack("i", hparams["n_positions"])) -fout.write(struct.pack("i", hparams["n_embd"])) -fout.write(struct.pack("i", hparams["n_head"])) -fout.write(struct.pack("i", hparams["n_layer"])) -#fout.write(struct.pack("i", hparams["rotary_dim"])) -fout.write(struct.pack("i", use_f16)) - -byte_encoder = bytes_to_unicode() -byte_decoder = {v:k for k, v in byte_encoder.items()} - -fout.write(struct.pack("i", len(encoder) + len(encoder_added))) - -for key in encoder: - text = bytearray([byte_decoder[c] for c in key]) - fout.write(struct.pack("i", len(text))) - fout.write(text) - -for key in encoder_added: - text = bytearray([byte_decoder[c] for c in key]) - fout.write(struct.pack("i", len(text))) - fout.write(text) - -for name in list_vars.keys(): - data = list_vars[name].squeeze().numpy() - print("Processing variable: " + name + " with shape: ", data.shape) - - # we don't need these - if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): - print(" Skipping variable: " + name) - continue - - n_dims = len(data.shape); - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype = 0; - if use_f16: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype = 0 - - # for efficiency - transpose these matrices: - # "transformer.h.*.mlp.c_proj.weight - if name.endswith(".mlp.c_proj.weight"): - print(" Transposing") - data = data.transpose() - - # rename headers to keep compatibility - if name == "ln_f.weight": - name = "model/ln_f/g" - elif name == "ln_f.bias": - name = "model/ln_f/b" - elif name == "wte.weight": - name = "model/wte" - elif name == "wpe.weight": - name = "model/wpe" - elif re.match(r"h\.\d+\.ln_1\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_1/g" - elif re.match(r"h\.\d+\.ln_1\.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_1/b" - elif re.match(r"h\.\d+\.attn\.c_attn\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_attn/w" - elif re.match(r"h\.\d+\.attn\.c_attn\.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_attn/b" - elif re.match(r"h\.\d+\.attn\.c_proj\.weight", name): - i = re.findall("\d+", name)[0] - 
name = f"model/h{i}/attn/c_proj/w" - elif re.match(r"h.\d+.attn.c_proj.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_proj/b" - elif re.match(r"h.\d+.ln_2.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_2/g" - elif re.match(r"h.\d+.ln_2.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_2/b" - elif re.match(r"h.\d+.mlp.c_fc.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_fc/w" - elif re.match(r"h.\d+.mlp.c_fc.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_fc/b" - elif re.match(r"h.\d+.mlp.c_proj.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_proj/w" - elif re.match(r"h.\d+.mlp.c_proj.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_proj/b" - else: - print("Unrecognized variable name. %s", name) - - str = name.encode('utf-8') - - fout.write(struct.pack("iii", n_dims, len(str), ftype)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str); - - # data - data.tofile(fout) - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/gpt-2-sparse/download-ggml-model.sh b/examples/gpt-2-sparse/download-ggml-model.sh deleted file mode 100755 index 3aae015b..00000000 --- a/examples/gpt-2-sparse/download-ggml-model.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -# This script downloads GPT-2 model files that have already been converted to ggml format. -# This way you don't have to convert them yourself. -# -# If you want to download the original GPT-2 model files, use the "download-model.sh" script instead. - -#src="https://ggml.ggerganov.com" -#pfx="ggml-model-gpt-2" - -src="https://huggingface.co/ggerganov/ggml" -pfx="resolve/main/ggml-model-gpt-2" - -ggml_path=$(dirname $(realpath $0)) - -# GPT-2 models -models=( "117M" "345M" "774M" "1558M" ) - -# list available models -function list_models { - printf "\n" - printf " Available models:" - for model in "${models[@]}"; do - printf " $model" - done - printf "\n\n" -} - -if [ "$#" -ne 1 ]; then - printf "Usage: $0 \n" - list_models - - exit 1 -fi - -model=$1 - -if [[ ! " ${models[@]} " =~ " ${model} " ]]; then - printf "Invalid model: $model\n" - list_models - - exit 1 -fi - -# download ggml model - -printf "Downloading ggml model $model ...\n" - -mkdir -p models/gpt-2-$model - -if [ -x "$(command -v wget)" ]; then - wget --quiet --show-progress -O models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin -elif [ -x "$(command -v curl)" ]; then - curl -L --output models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin -else - printf "Either wget or curl is required to download models.\n" - exit 1 -fi - -if [ $? -ne 0 ]; then - printf "Failed to download ggml model $model \n" - printf "Please try again later or download the original GPT-2 model files and convert them yourself.\n" - exit 1 -fi - -printf "Done! 
Model '$model' saved in 'models/gpt-2-$model/ggml-model.bin'\n" -printf "You can now use it like this:\n\n" -printf " $ ./bin/gpt-2 -m models/gpt-2-$model/ggml-model.bin -p \"This is an example\"\n" -printf "\n" diff --git a/examples/gpt-2-sparse/download-model.sh b/examples/gpt-2-sparse/download-model.sh deleted file mode 100755 index f0c62f4f..00000000 --- a/examples/gpt-2-sparse/download-model.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -ggml_path=$(dirname $(realpath $0)) - -# GPT-2 models -models=( "117M" "345M" "774M" "1558M" ) - -# list available models -function list_models { - printf "\n" - printf " Available models:" - for model in "${models[@]}"; do - printf " $model" - done - printf "\n\n" -} - -if [ "$#" -ne 1 ]; then - printf "Usage: $0 \n" - list_models - - exit 1 -fi - -model=$1 - -if [[ ! " ${models[@]} " =~ " ${model} " ]]; then - printf "Invalid model: $model\n" - list_models - - exit 1 -fi - -# download model - -printf "Downloading model $model ...\n" - -mkdir -p models/gpt-2-$model - -for file in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do - wget --quiet --show-progress -O models/gpt-2-$model/$file https://openaipublic.blob.core.windows.net/gpt-2/models/$model/$file -done - -printf "Done! Model '$model' saved in 'models/gpt-2-$model/'\n\n" -printf "Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.\n" -printf "\n" -printf " python $ggml_path/convert-ckpt-to-ggml.py models/gpt-2-$model/\n" -printf "\n" diff --git a/examples/gpt-2-sparse/main-30b.cpp b/examples/gpt-2-sparse/main-30b.cpp deleted file mode 100644 index 73eeff25..00000000 --- a/examples/gpt-2-sparse/main-30b.cpp +++ /dev/null @@ -1,1593 +0,0 @@ -#include "ggml.h" -#include "ggml-alloc.h" -#include - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-cuda.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif -typedef void (*offload_func_t)(struct ggml_tensor * tensor); -void opt_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; -} -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - // struct ggml_tensor * c_attn_attn_w; - // struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_attn_q_w; - struct ggml_tensor * c_attn_attn_q_b; - - struct ggml_tensor * c_attn_attn_k_w; - struct ggml_tensor * c_attn_attn_k_b; - - struct ggml_tensor * c_attn_attn_v_w; - struct ggml_tensor * c_attn_attn_v_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; - - struct ggml_tensor * gpu_idx; - struct ggml_tensor * gpu_bucket; - // gpu heat - struct ggml_tensor * c_mlp_fc_w_gpu; - struct ggml_tensor * c_mlp_proj_w_t; - struct ggml_tensor * c_mlp_proj_w_gpu; - - //predictor - struct ggml_tensor * mlp_pre_w1_w; - struct ggml_tensor * mlp_pre_w2_w; -}; - -struct opt_file { - // use FILE * so we don't have to re-open the 
file to mmap - FILE * fp; - size_t size; - - opt_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error("opt_file fail\n"); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - ~opt_file() { - if (fp) { - std::fclose(fp); - } - } -}; -#define _POSIX_MAPPED_FILES -#include -#include - -struct opt_mmap { - void * addr; - size_t size; - - opt_mmap(const opt_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error("mmap failed\n"); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~opt_mmap() { - munmap(addr, size); - } -#else - static constexpr bool SUPPORTED = false; - - opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -struct gpt2_model { - gpt2_hparams hparams; - struct opt_file * file; - struct opt_mmap * mapping; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -struct ggml_context * ctx0 = nullptr; -// std::vector compute_buffer; -void *compute_buffer; - -bool endsWith(const std::string& str, const std::string& suffix) { - if (str.length() < suffix.length()) { - return false; - } - return str.substr(str.length() - suffix.length()) == suffix; -} - - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - model.file = new opt_file(fname.c_str(), "rb"); - printf("size %d\n", model.file->size); - model.mapping = new opt_mmap(model.file, 0, false); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return 
false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - /* int32_t n_vocab = 0; */ - /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ - - /* if (n_vocab != model.hparams.n_vocab) { */ - /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ - /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ - /* return false; */ - /* } */ - int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - printf("wtype %d\n", wtype); - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += 
n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - //need refactor - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); - //predictor - ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size = 0; - - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*51200; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - int main_gpu = 0; -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#else -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; - // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; - - // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); - // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // model.lm_head->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["output_norm.weight"] = &model.ln_f_g; - model.tensors["output_norm.bias"] = &model.ln_f_b; - - model.tensors["tok_embeddings.weight"] = &model.wte; - model.tensors["pos_embeddings.weight"] = &model.wpe; - model.tensors["output.weight"] = &model.lm_head; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - memset(&layer, 0, sizeof(gpt2_layer)); - - // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // // 
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - // // need refine - // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); - // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); - // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); - - // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); - // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); - - // if (i <= 10) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); - // } else if (i <= 12) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); - // } else if (i <= 18) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); - - // } else if (i <= 21) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); - // } else if (i <= 26) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); - // } else if (i <= 31) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); - // } - - // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; - - // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; - // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_w_gpu->backend = 
OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; - model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; - - model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; - model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; - model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; - model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; - model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; - model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; - - model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; - model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; - model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; - } - } - - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - #ifdef GGML_USE_CUBLAS - // ggml_cuda_assign_buffers_no_scratch(model.memory_k); - // ggml_cuda_assign_buffers_no_scratch(model.memory_v); - #endif - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - ggml_set_no_alloc(ctx, true); - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - const std::vector to_gpu = { - "output_norm.bias", - "output_norm.weight", - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - // ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - "layers.*output_norm.weight", - "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_gpu_lv = { - // ".*attention.wq.weight", - // ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - // "layers.*output_norm.weight", - // "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_lock = { - "tok_embeddings.weight", - "pos_embeddings.weight", - // "output_norm.bias", - ".*attention.wq.weight", - ".*attention.wq.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - // ".*attention.wo.bias", - ".*feed_forward.w1.weight", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight", - ".*feed_forward.w2.bias", - ".*gpu.weight", - ".*attention_norm.weight", - ".*attention_norm.bias", - ".*output_norm.weight", - ".*output_norm.bias", - ".*attention.*fc1.weight", - ".*attention.*fc1.bias", - ".*attention.*fc2.weight", - ".*attention.*fc2.bias", - // ".*w2.bias", - // ".*w1.bias", - "output.weight", - }; - - while (true) { - int32_t n_dims; - 
int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - int64_t new_ne[2]; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - new_ne[i] = ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - ggml_tensor ** ptr = model.tensors[name]; - // printf("name %s ptr %p\n", name.c_str(), *ptr); - // int k; - // scanf("%d", &k); - *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); - - auto tensor = (ggml_tensor *)*model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - - // for debugging - if (1) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - std::streampos offset = fin.tellg(); - // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - fin.seekg(ggml_nbytes(tensor), std::ios::cur); - tensor->data = model.mapping->addr + static_cast(offset); - // if ( endsWith(name.c_str(), "weight_transpose")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i+4096]); - // } - // } - // printf("\n"); - // if (endsWith(name.c_str(), "weight_h20")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i]); - - // } - // int k; - // scanf("%d", &k); - // } - - // // GPT-2 models share the WTE tensor as the LM head - // if (name == "model/wte" && has_lm_head == false) { - // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - // } - - // if (name == "model/lm_head") { - // has_lm_head = true; - // } - if (model_params.low_vram == false) { - for (const auto &s : to_gpu) - { - // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) - // { - // std::regex pattern(R"(\d+)"); - // std::smatch match; - // int layer_id = 0; - // if (std::regex_search(name, match, pattern)) - // { - // std::string digitStr = match.str(); - // int num = std::stoi(digitStr); - // layer_id = num; - // } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - // if (layer_id > model_params.n_gpu_layers) - // break; - // } - if (std::regex_search(name, std::regex(s))) - { - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - } else { - for (const auto &s : to_gpu_lv) - { - if 
(std::regex_search(name, std::regex(s))) - { - std::regex pattern(R"(\d+)"); - std::smatch match; - int layer_id = 0; - if (std::regex_search(name, match, pattern)) - { - std::string digitStr = match.str(); - int num = std::stoi(digitStr); - layer_id = num; - } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - if (layer_id > model_params.n_gpu_layers) - break; - // printf("name %s\n", name.c_str()); - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - - } - if (tensor->backend == GGML_BACKEND_GPU) { - #if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); - #endif - } - for (const auto &s : to_lock) - { - if (std::regex_match(name, std::regex(s))) - { - if(!mlock(tensor->data, ggml_nbytes(tensor))) { - // printf("mlock %s\n", name.c_str()); - } - else { - printf("mlock failed %s\n", name.c_str()); - } - } - } - - total_size += ggml_nbytes(tensor); - } - ggml_set_no_alloc(ctx, false); - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - printf("load finish\n"); - // int k; - // scanf("%d", &k); - - fin.close(); - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); - // static std::vector buf(buf_size); - static void * buf = ggml_cuda_host_malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i + 2; - } - } - offload_func_t offload_func = opt_nop; - offload_func_t offload_func_kq = opt_nop; - offload_func_t offload_func_v = opt_nop; - offload_func_t offload_func_nr = opt_nop; - offload_func_t offload_debug = opt_nop; -#ifdef GGML_USE_CUBLAS - offload_debug = ggml_cuda_assign_buffers_no_alloc; - // offload_func = ggml_cuda_assign_buffers_no_alloc; - // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - // offload_func_v = ggml_cuda_assign_buffers_no_alloc; - // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; -#endif - // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; - // int k; - // scanf("%d", &k); - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } 
- - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - ggml_set_name(inpL, "inpL_first"); - // offload_func(inpL); - - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - offload_func(cur); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g); - offload_func(cur); - ggml_set_name(cur, "ln_1_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_1_b); - ggml_set_name(cur, "ln_1_b"); - // offload_func(cur); - - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - - struct ggml_tensor *k_cpy = nullptr; - struct ggml_tensor *v_cpy = nullptr; - // self-attention - { - // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); - offload_func_kq(Qcur); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); - offload_func_kq(Kcur); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); - offload_func_kq(Kcur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); - offload_func_v(Vcur); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); - offload_func_v(Vcur); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - offload_func_v(Vcur); - - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - offload_func_kq(k); - // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); - - offload_func_v(v); - k_cpy = ggml_cpy(ctx0, Kcur, k); - offload_func_kq(k_cpy); - ggml_set_name(k_cpy, "k_cpy"); - v_cpy = ggml_cpy(ctx0, Vcur, v); - offload_func_v(v_cpy); - ggml_set_name(v_cpy, "v_cpy"); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); - offload_func_kq(Qcur); - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - offload_func_kq(Q); - - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - // struct ggml_tensor * K = - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 
0, 2, 1, 3); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, model.memory_k, - 128, n_past + N, n_head, - ggml_element_size(model.memory_k)*n_embd, - ggml_element_size(model.memory_k)*128, - ggml_element_size(model.memory_k)*n_embd*n_ctx*il); - K->src[1] = k_cpy; - offload_func_kq(K); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); - offload_func_kq(KQ_scaled); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, 128, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*128, - n_ctx*ggml_element_size(model.memory_k)*n_embd*il); - V->src[1] = v_cpy; - offload_func_v(V); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merge_cont"); - offload_func_v(cur); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_proj"); - offload_func(cur); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_set_name(cur, "attn_bias"); - offload_func(cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - ggml_set_name(cur, "after attn"); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - ggml_tensor *idx = nullptr; - ggml_tensor *idx_g = nullptr; - ggml_tensor *cur_c = nullptr; - - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - offload_func(cur); - ggml_set_name(cur, "norm_FFN"); - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g); - offload_func(cur); - ggml_set_name(cur, "norm_FFN_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_2_b); - // offload_func(cur); - // ggml_set_name(cur, "norm_FFN_w"); - // cur_c = ggml_dup(ctx0, cur); - } - // if (N == 1) - if (1) - { - idx = ggml_mul_mat(ctx0, - 
model.layers[il].mlp_pre_w1_w, - inpFF); - offload_func(idx); - ggml_set_name(idx, "mlp_pre_w1"); - idx = ggml_relu(ctx0, idx); - offload_func(idx); - ggml_set_name(idx, "relu_pre"); - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w2_w, - idx); - ggml_set_name(idx, "mlp_pre_w2"); - // offload_func(idx); - // idx = ggml_sigmoid(ctx0, idx); - // offload_func(idx); - // idx_g = idx; - // idx = ggml_dup(ctx0, idx_g); - // ggml_set_name(idx, "idx_cpu_dup"); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - if (N >= 80) - // if (0) - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - offload_debug(cur); - offload_func(cur); - ggml_set_name(cur, "up_ffn"); - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - offload_debug(cur); - offload_func(cur); - } - else - { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_fc_w, - // cur); - // offload_func(cur); - // cur = ggml_add(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b); - // offload_func(cur); - - - struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, - model.layers[il].c_mlp_fc_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "mlp_up_gpu"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_mul_mat_idx(ctx0, - model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_idx); - ggml_set_name(cur, "mlp_up_cpu"); - cur = ggml_add_idx(ctx0, - cur, - model.layers[il].c_mlp_fc_b, - idx); - ggml_set_name(tmp, "mlp_up_bias"); - offload_debug(tmp); - offload_func(tmp); - - cur = ggml_add(ctx0, cur, tmp); - ggml_set_name(cur, "mlp_up_mix"); - offload_func(cur); - - // cur = tmp; - - } - - - - // GELU activation - // [3072, N] - cur = ggml_relu(ctx0, cur); - // cur_c = cur; - // offload_func(cur); - cur_c = cur->backend==GGML_BACKEND_CPU? 
cur : ggml_dup(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - if (N >= 80) { - // if (0) { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur, - NULL, - NULL); - offload_debug(cur); - offload_func(cur); - ggml_set_name(cur, "down_ffn"); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - offload_func(cur); - offload_debug(cur); - } - else { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - // offload_func(cur); - - // cur = ggml_axpy(ctx0, - // model.layers[il].c_mlp_proj_w_t, - // cur, - // NULL, - // NULL); - // offload_func(cur); - - - // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, - // model.layers[il].c_mlp_proj_w_gpu, - // cur, - // model.layers[il].gpu_bucket, - // NULL); - struct ggml_tensor *tmp = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "axpy"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur_c, - idx, - model.layers[il].gpu_idx); - - cur = ggml_add(ctx0, cur, tmp); - offload_func(cur); - - cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); - offload_func(cur); - - // tmp = ggml_add(ctx0, - // tmp, - // model.layers[il].c_mlp_proj_b); - // offload_func(tmp); - // offload_debug(tmp); - - // cur = tmp; - } - - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - offload_func(inpL); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - offload_func_nr(inpL); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_mul(ctx0, - inpL, - model.ln_f_g); - offload_func_nr(inpL); - inpL = ggml_add(ctx0, - inpL, - model.ln_f_b); - ggml_set_name(inpL, "before"); - offload_func_nr(inpL); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_set_name(inpL, "last_layer"); -// offload_func(inpL); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); - - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); - -#ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); - ggml_cuda_assign_scratch_offset(node, 
(char*)node->data - (char *) compute_buffer); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } -#endif - - - - // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); - - //if (n_past%100 == 0) { - // ggml_graph_print (gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, "hello world"); - } - printf("load finish\n"); - - // keep this buffer alive while evaluating the model - - struct ggml_allocr * allocr = NULL; - // allocate the compute buffer - { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - // compute_buffer.resize(mem_size); - compute_buffer = ggml_cuda_host_malloc(mem_size); - // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); - allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); - - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // 
submit the input prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - std::vector embd; - - int cnt = 0; - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - cnt += 1; - - if (cnt > 0) - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - llama_sampling_params & sparams = params.sparams; - const int top_k = sparams.top_k; - const float top_p = sparams.top_p; - const float temp = sparams.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-2-sparse/main.cpp_123 b/examples/gpt-2-sparse/main.cpp_123 deleted file mode 100644 index 4deed1df..00000000 --- a/examples/gpt-2-sparse/main.cpp_123 +++ /dev/null @@ -1,1592 +0,0 @@ -#include "ggml.h" -#include "ggml-alloc.h" -#include - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-cuda.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif -typedef void (*offload_func_t)(struct ggml_tensor * tensor); -void opt_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; -} -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - // struct ggml_tensor * c_attn_attn_w; - // struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_attn_q_w; - struct ggml_tensor * c_attn_attn_q_b; - - struct ggml_tensor * c_attn_attn_k_w; - struct ggml_tensor * c_attn_attn_k_b; - - struct ggml_tensor * c_attn_attn_v_w; - struct ggml_tensor * c_attn_attn_v_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * 
c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; - - struct ggml_tensor * gpu_idx; - struct ggml_tensor * gpu_bucket; - // gpu heat - struct ggml_tensor * c_mlp_fc_w_gpu; - struct ggml_tensor * c_mlp_proj_w_t; - struct ggml_tensor * c_mlp_proj_w_gpu; - - //predictor - struct ggml_tensor * mlp_pre_w1_w; - struct ggml_tensor * mlp_pre_w2_w; -}; - -struct opt_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - opt_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error("opt_file fail\n"); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - ~opt_file() { - if (fp) { - std::fclose(fp); - } - } -}; -#define _POSIX_MAPPED_FILES -#include -#include - -struct opt_mmap { - void * addr; - size_t size; - - opt_mmap(const opt_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error("mmap failed\n"); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~opt_mmap() { - munmap(addr, size); - } -#else - static constexpr bool SUPPORTED = false; - - opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -struct gpt2_model { - gpt2_hparams hparams; - struct opt_file * file; - struct opt_mmap * mapping; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -struct ggml_context * ctx0 = nullptr; -// std::vector compute_buffer; -void *compute_buffer; - -bool endsWith(const std::string& str, const std::string& suffix) { - if (str.length() < suffix.length()) { - return false; - } - return str.substr(str.length() - suffix.length()) 
== suffix; -} - - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - model.file = new opt_file(fname.c_str(), "rb"); - printf("size %d\n", model.file->size); - model.mapping = new opt_mmap(model.file, 0, false); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - /* int32_t n_vocab = 0; */ - /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ - - /* if (n_vocab != model.hparams.n_vocab) { */ - /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ - /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ - /* return false; */ - /* } */ - int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - printf("wtype %d\n", wtype); - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += 
n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - //need refactor - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); - //predictor - ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size = 0; - - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*51200; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - int main_gpu = 0; -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#else -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; - // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; - - // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); - // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // model.lm_head->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["output_norm.weight"] = &model.ln_f_g; - model.tensors["output_norm.bias"] = &model.ln_f_b; - - 
model.tensors["tok_embeddings.weight"] = &model.wte; - model.tensors["pos_embeddings.weight"] = &model.wpe; - model.tensors["output.weight"] = &model.lm_head; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - memset(&layer, 0, sizeof(gpt2_layer)); - - // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - // // need refine - // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); - // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); - // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); - - // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); - // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); - - // if (i <= 10) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); - // } else if (i <= 12) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); - // } else if (i <= 18) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); - - // } else if (i <= 21) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); - // } else if (i <= 26) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); - // } else if (i <= 31) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); - // } - - // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_w->backend = 
OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; - - // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; - // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; - model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; - - model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; - model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; - model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; - model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; - model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; - model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; - - model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; - model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; - model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; - } - } - - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - #ifdef GGML_USE_CUBLAS - // ggml_cuda_assign_buffers_no_scratch(model.memory_k); - // ggml_cuda_assign_buffers_no_scratch(model.memory_v); - #endif - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - ggml_set_no_alloc(ctx, true); - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - const std::vector to_gpu = { - "output_norm.bias", - "output_norm.weight", - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - // ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - "layers.*output_norm.weight", - "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_gpu_lv = { - // ".*attention.wq.weight", - // ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - // "layers.*output_norm.weight", - // "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_lock = { - "tok_embeddings.weight", - "pos_embeddings.weight", - // "output_norm.bias", - ".*attention.wq.weight", - ".*attention.wq.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - // ".*attention.wo.bias", - ".*feed_forward.w1.weight", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight", - ".*feed_forward.w2.bias", - ".*gpu.weight", - ".*attention_norm.weight", - ".*attention_norm.bias", - ".*output_norm.weight", - ".*output_norm.bias", - ".*attention.*fc1.weight", - ".*attention.*fc1.bias", - ".*attention.*fc2.weight", - ".*attention.*fc2.bias", - // ".*w2.bias", - // ".*w1.bias", - "output.weight", - }; - - while (true) { - int32_t n_dims; - 
int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - int64_t new_ne[2]; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - new_ne[i] = ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - ggml_tensor ** ptr = model.tensors[name]; - // printf("name %s ptr %p\n", name.c_str(), *ptr); - // int k; - // scanf("%d", &k); - *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); - - auto tensor = (ggml_tensor *)*model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - - // for debugging - if (1) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - std::streampos offset = fin.tellg(); - // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - fin.seekg(ggml_nbytes(tensor), std::ios::cur); - tensor->data = model.mapping->addr + static_cast(offset); - // if ( endsWith(name.c_str(), "weight_transpose")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i+4096]); - // } - // } - // printf("\n"); - // if (endsWith(name.c_str(), "weight_h20")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i]); - - // } - // int k; - // scanf("%d", &k); - // } - - // // GPT-2 models share the WTE tensor as the LM head - // if (name == "model/wte" && has_lm_head == false) { - // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - // } - - // if (name == "model/lm_head") { - // has_lm_head = true; - // } - if (model_params.low_vram == false) { - for (const auto &s : to_gpu) - { - // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) - // { - // std::regex pattern(R"(\d+)"); - // std::smatch match; - // int layer_id = 0; - // if (std::regex_search(name, match, pattern)) - // { - // std::string digitStr = match.str(); - // int num = std::stoi(digitStr); - // layer_id = num; - // } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - // if (layer_id > model_params.n_gpu_layers) - // break; - // } - if (std::regex_search(name, std::regex(s))) - { - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - } else { - for (const auto &s : to_gpu_lv) - { - if 
(std::regex_search(name, std::regex(s))) - { - std::regex pattern(R"(\d+)"); - std::smatch match; - int layer_id = 0; - if (std::regex_search(name, match, pattern)) - { - std::string digitStr = match.str(); - int num = std::stoi(digitStr); - layer_id = num; - } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - if (layer_id > model_params.n_gpu_layers) - break; - // printf("name %s\n", name.c_str()); - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - - } - if (tensor->backend == GGML_BACKEND_GPU) { - #if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); - #endif - } - for (const auto &s : to_lock) - { - if (std::regex_match(name, std::regex(s))) - { - if(!mlock(tensor->data, ggml_nbytes(tensor))) { - // printf("mlock %s\n", name.c_str()); - } - else { - printf("mlock failed %s\n", name.c_str()); - } - } - } - - total_size += ggml_nbytes(tensor); - } - ggml_set_no_alloc(ctx, false); - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - printf("load finish\n"); - // int k; - // scanf("%d", &k); - - fin.close(); - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); - // static std::vector buf(buf_size); - static void * buf = ggml_cuda_host_malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i + 2; - } - } - offload_func_t offload_func = opt_nop; - offload_func_t offload_func_kq = opt_nop; - offload_func_t offload_func_v = opt_nop; - offload_func_t offload_func_nr = opt_nop; - offload_func_t offload_debug = opt_nop; -#ifdef GGML_USE_CUBLAS - offload_debug = ggml_cuda_assign_buffers_no_alloc; - // offload_func = ggml_cuda_assign_buffers_no_alloc; - // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - // offload_func_v = ggml_cuda_assign_buffers_no_alloc; - // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; -#endif - // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; - // int k; - // scanf("%d", &k); - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } 
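#if 0   // [illustrative sketch added in review, not part of the deleted file]
// The !ggml_allocr_is_measure(allocr) guards above exist because gpt2_graph() is run twice:
// once under a "measure" allocator (with no backing buffer yet) to size the compute buffer,
// and then on every eval with a real allocator. Below is a minimal sketch of that two-pass
// setup, mirroring the allocation code in main() later in this file and using only the
// ggml-alloc calls that already appear there; the helper name make_compute_allocr and the
// std::vector<gpt_vocab::id> element type are assumptions.
static struct ggml_allocr * make_compute_allocr(const gpt2_model & model, int n_batch, void ** out_buf) {
    // pass 1: measure — build the worst-case graph (a full batch at the end of the context window)
    struct ggml_allocr * a = ggml_allocr_new_measure(GGML_MEM_ALIGN);
    const int n_tokens = std::min(model.hparams.n_ctx, n_batch);
    const int n_past   = model.hparams.n_ctx - n_tokens;
    struct ggml_cgraph * gf = gpt2_graph(model, a, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
    const size_t mem_size = ggml_allocr_alloc_graph(a, gf) + GGML_MEM_ALIGN;
    ggml_allocr_free(a);

    // pass 2: allocate a real (pinned) buffer of the measured size and return an allocator over it
    *out_buf = ggml_cuda_host_malloc(mem_size);
    return ggml_allocr_new(*out_buf, mem_size, GGML_MEM_ALIGN);
}
#endif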
- - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - ggml_set_name(inpL, "inpL_first"); - // offload_func(inpL); - - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - offload_func(cur); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g); - offload_func(cur); - ggml_set_name(cur, "ln_1_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_1_b); - ggml_set_name(cur, "ln_1_b"); - // offload_func(cur); - - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - - struct ggml_tensor *k_cpy = nullptr; - struct ggml_tensor *v_cpy = nullptr; - // self-attention - { - // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); - offload_func_kq(Qcur); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); - offload_func_kq(Kcur); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); - offload_func_kq(Kcur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); - offload_func_v(Vcur); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); - offload_func_v(Vcur); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - offload_func_v(Vcur); - - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - offload_func_kq(k); - // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); - - offload_func_v(v); - k_cpy = ggml_cpy(ctx0, Kcur, k); - offload_func_kq(k_cpy); - ggml_set_name(k_cpy, "k_cpy"); - v_cpy = ggml_cpy(ctx0, Vcur, v); - offload_func_v(v_cpy); - ggml_set_name(v_cpy, "v_cpy"); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); - offload_func_kq(Qcur); - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - offload_func_kq(Q); - - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - // struct ggml_tensor * K = - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 
0, 2, 1, 3); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, model.memory_k, - 128, n_past + N, n_head, - ggml_element_size(model.memory_k)*n_embd, - ggml_element_size(model.memory_k)*128, - ggml_element_size(model.memory_k)*n_embd*n_ctx*il); - K->src[1] = k_cpy; - offload_func_kq(K); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); - offload_func_kq(KQ_scaled); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, 128, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*128, - n_ctx*ggml_element_size(model.memory_k)*n_embd*il); - V->src[1] = v_cpy; - offload_func_v(V); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merge_cont"); - offload_func_v(cur); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_proj"); - offload_func(cur); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_set_name(cur, "attn_bias"); - offload_func(cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - ggml_set_name(cur, "after attn"); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - ggml_tensor *idx = nullptr; - ggml_tensor *idx_g = nullptr; - ggml_tensor *cur_c = nullptr; - - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - offload_func(cur); - ggml_set_name(cur, "norm_FFN"); - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g); - offload_func(cur); - ggml_set_name(cur, "norm_FFN_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_2_b); - // offload_func(cur); - // ggml_set_name(cur, "norm_FFN_w"); - // cur_c = ggml_dup(ctx0, cur); - } - // if (N == 1) - if (1) - { - idx = ggml_mul_mat(ctx0, - 
model.layers[il].mlp_pre_w1_w, - inpFF); - offload_func(idx); - ggml_set_name(idx, "mlp_pre_w1"); - idx = ggml_relu(ctx0, idx); - offload_func(idx); - ggml_set_name(idx, "relu_pre"); - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w2_w, - idx); - ggml_set_name(idx, "mlp_pre_w2"); - // offload_func(idx); - // idx = ggml_sigmoid(ctx0, idx); - // offload_func(idx); - // idx_g = idx; - // idx = ggml_dup(ctx0, idx_g); - // ggml_set_name(idx, "idx_cpu_dup"); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - if (N >= 80) - // if (0) - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - offload_debug(cur); - offload_func(cur); - ggml_set_name(cur, "up_ffn"); - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - offload_debug(cur); - offload_func(cur); - } - else - { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_fc_w, - // cur); - // offload_func(cur); - // cur = ggml_add(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b); - // offload_func(cur); - - - struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, - model.layers[il].c_mlp_fc_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "mlp_up_gpu"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_mul_mat_idx(ctx0, - model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_idx); - ggml_set_name(cur, "mlp_up_cpu"); - cur = ggml_add_idx(ctx0, - cur, - model.layers[il].c_mlp_fc_b, - idx); - ggml_set_name(tmp, "mlp_up_bias"); - offload_debug(tmp); - offload_func(tmp); - - cur = ggml_add(ctx0, cur, tmp); - ggml_set_name(cur, "mlp_up_mix"); - offload_func(cur); - - // cur = tmp; - - } - - - - // GELU activation - // [3072, N] - cur = ggml_relu(ctx0, cur); - // cur_c = cur; - // offload_func(cur); - cur_c = cur->backend==GGML_BACKEND_CPU? 
cur : ggml_dup(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - if (N >= 80) { - // if (0) { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur, - NULL, - NULL); - offload_debug(cur); - offload_func(cur); - ggml_set_name(cur, "down_ffn"); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - offload_func(cur); - offload_debug(cur); - } - else { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - // offload_func(cur); - - // cur = ggml_axpy(ctx0, - // model.layers[il].c_mlp_proj_w_t, - // cur, - // NULL, - // NULL); - // offload_func(cur); - - - // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, - // model.layers[il].c_mlp_proj_w_gpu, - // cur, - // model.layers[il].gpu_bucket, - // NULL); - struct ggml_tensor *tmp = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "axpy"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur_c, - idx, - model.layers[il].gpu_idx); - - cur = ggml_add(ctx0, cur, tmp); - offload_func(cur); - - cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); - offload_func(cur); - - // tmp = ggml_add(ctx0, - // tmp, - // model.layers[il].c_mlp_proj_b); - // offload_func(tmp); - // offload_debug(tmp); - - // cur = tmp; - } - - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - offload_func(inpL); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - offload_func_nr(inpL); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_mul(ctx0, - inpL, - model.ln_f_g); - offload_func_nr(inpL); - inpL = ggml_add(ctx0, - inpL, - model.ln_f_b); - ggml_set_name(inpL, "before"); - offload_func_nr(inpL); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_set_name(inpL, "last_layer"); -// offload_func(inpL); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); - - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); - -#ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); - ggml_cuda_assign_scratch_offset(node, 
(char*)node->data - (char *) compute_buffer); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } -#endif - - - - // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); - - //if (n_past%100 == 0) { - // ggml_graph_print (gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, "hello world"); - } - printf("load finish\n"); - - // keep this buffer alive while evaluating the model - - struct ggml_allocr * allocr = NULL; - // allocate the compute buffer - { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - // compute_buffer.resize(mem_size); - compute_buffer = ggml_cuda_host_malloc(mem_size); - // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); - allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); - - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // submit the input 
prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - std::vector embd; - - int cnt = 0; - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - cnt += 1; - - if (cnt > 0) - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-2-sparse/main.cpp_bak b/examples/gpt-2-sparse/main.cpp_bak deleted file mode 100644 index e1e9d58e..00000000 --- a/examples/gpt-2-sparse/main.cpp_bak +++ /dev/null @@ -1,1546 +0,0 @@ -#include "ggml.h" -#include "ggml-alloc.h" -#include - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-cuda.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif -typedef void (*offload_func_t)(struct ggml_tensor * tensor); -void opt_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; -} -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - // struct ggml_tensor * c_attn_attn_w; - // struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_attn_q_w; - struct ggml_tensor * c_attn_attn_q_b; - - struct ggml_tensor * c_attn_attn_k_w; - struct ggml_tensor * c_attn_attn_k_b; - - struct ggml_tensor * c_attn_attn_v_w; - struct ggml_tensor * c_attn_attn_v_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct 
ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; - - struct ggml_tensor * gpu_idx; - struct ggml_tensor * gpu_bucket; - // gpu heat - struct ggml_tensor * c_mlp_fc_w_gpu; - struct ggml_tensor * c_mlp_proj_w_t; - struct ggml_tensor * c_mlp_proj_w_gpu; - - //predictor - struct ggml_tensor * mlp_pre_w1_w; - struct ggml_tensor * mlp_pre_w2_w; -}; - -struct opt_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - opt_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error("opt_file fail\n"); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - ~opt_file() { - if (fp) { - std::fclose(fp); - } - } -}; -#define _POSIX_MAPPED_FILES -#include -#include - -struct opt_mmap { - void * addr; - size_t size; - - opt_mmap(const opt_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error("mmap failed\n"); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~opt_mmap() { - munmap(addr, size); - } -#else - static constexpr bool SUPPORTED = false; - - opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -struct gpt2_model { - gpt2_hparams hparams; - struct opt_file * file; - struct opt_mmap * mapping; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -struct ggml_context * ctx0 = nullptr; -// std::vector compute_buffer; -void *compute_buffer; - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - model.file = new 
opt_file(fname.c_str(), "rb"); - printf("size %d\n", model.file->size); - model.mapping = new opt_mmap(model.file, 0, false); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - /* int32_t n_vocab = 0; */ - /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ - - /* if (n_vocab != model.hparams.n_vocab) { */ - /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ - /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ - /* return false; */ - /* } */ - int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - printf("wtype %d\n", wtype); - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size 
+= n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - //need refactor - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); - //predictor - ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size = 0; - - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*51200; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - int main_gpu = 0; -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#else -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; - // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; - - // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); - // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // model.lm_head->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["output_norm.weight"] = &model.ln_f_g; - model.tensors["output_norm.bias"] = &model.ln_f_b; - - model.tensors["tok_embeddings.weight"] = &model.wte; - model.tensors["pos_embeddings.weight"] = &model.wpe; - model.tensors["output.weight"] = &model.lm_head; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - memset(&layer, 0, sizeof(gpt2_layer)); - - // layer.ln_1_g = ggml_new_tensor_1d(ctx, 
GGML_TYPE_F32, n_embd); - // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - // // need refine - // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); - // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); - // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); - - // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); - // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); - - // if (i <= 10) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); - // } else if (i <= 12) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); - // } else if (i <= 18) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); - - // } else if (i <= 21) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); - // } else if (i <= 26) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); - // } else if (i <= 31) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); - // } - - // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; - // // 
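// Standalone sketch, not from the patch: the tensor map in this loader stores
// `ggml_tensor **` slots keyed by name, and the tensors themselves are only created
// later in the load loop via `*ptr = ggml_new_tensor(...)`. A minimal version of that
// deferred-creation registry, with a plain struct ("Blob") standing in for ggml_tensor:
#include <cstdio>
#include <map>
#include <string>

struct Blob { int n; };                           // stand-in for ggml_tensor

int main() {
    Blob *ln_f_g = nullptr;                       // slots start out empty
    Blob *ln_f_b = nullptr;

    std::map<std::string, Blob **> registry;      // name -> address of the slot
    registry["output_norm.weight"] = &ln_f_g;
    registry["output_norm.bias"]   = &ln_f_b;

    // later, while scanning the file, each name is looked up and the object is
    // created in place, assigning through the double pointer
    for (auto &kv : registry) {
        *kv.second = new Blob{ (int) kv.first.size() };
    }

    std::printf("ln_f_g->n = %d, ln_f_b->n = %d\n", ln_f_g->n, ln_f_b->n);
    delete ln_f_g;
    delete ln_f_b;
    return 0;
}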
layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; - - // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; - // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; - model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; - - model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; - model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; - model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; - model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; - model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; - model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; - - model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; - model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; - model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; - } - } - - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - #ifdef GGML_USE_CUBLAS - // ggml_cuda_assign_buffers_no_scratch(model.memory_k); - // ggml_cuda_assign_buffers_no_scratch(model.memory_v); - #endif - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - ggml_set_no_alloc(ctx, true); - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - const std::vector to_gpu = { - "output_norm.bias", - "output_norm.weight", - // ".*attention.wq.weight", - // ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - // ".*feed_forward.w1.weight$", - ".*feed_forward.w1.bias", - // ".*feed_forward.w2.weight_h20$", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight$", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - "layers.*output_norm.weight", - "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_gpu_lv = { - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - // "layers.*output_norm.weight", - // "layers.*output_norm.bias", - // ".*fc1.weight", - // ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_lock = { - "tok_embeddings.weight", - "pos_embeddings.weight", - // "output_norm.bias", - ".*attention.wq.weight", - ".*attention.wq.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - // ".*attention.wo.bias", - ".*feed_forward.w1.weight", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight", - ".*feed_forward.w2.bias", - ".*gpu.weight", - ".*attention_norm.weight", - ".*attention_norm.bias", - ".*output_norm.weight", - ".*output_norm.bias", - ".*attention.*fc1.weight", - ".*attention.*fc1.bias", - ".*attention.*fc2.weight", - ".*attention.*fc2.bias", - // ".*w2.bias", - // 
".*w1.bias", - "output.weight", - }; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - int64_t new_ne[2]; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - new_ne[i] = ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - ggml_tensor ** ptr = model.tensors[name]; - // printf("name %s ptr %p\n", name.c_str(), *ptr); - // int k; - // scanf("%d", &k); - *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); - - auto tensor = (ggml_tensor *)*model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - std::streampos offset = fin.tellg(); - // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - fin.seekg(ggml_nbytes(tensor), std::ios::cur); - tensor->data = model.mapping->addr + static_cast(offset); - - // // GPT-2 models share the WTE tensor as the LM head - // if (name == "model/wte" && has_lm_head == false) { - // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - // } - - // if (name == "model/lm_head") { - // has_lm_head = true; - // } - if (model_params.low_vram == false) { - for (const auto &s : to_gpu) - { - if (std::regex_search(name, std::regex(s))) - { - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - } else { - for (const auto &s : to_gpu_lv) - { - if (std::regex_search(name, std::regex(s))) - { - std::regex pattern(R"(\d+)"); - std::smatch match; - int layer_id = 0; - if (std::regex_search(name, match, pattern)) - { - std::string digitStr = match.str(); - int num = std::stoi(digitStr); - layer_id = num; - } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - if (layer_id > model_params.n_gpu_layers) - break; - // printf("name %s\n", name.c_str()); - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - - } - if (tensor->backend == GGML_BACKEND_GPU) { - #if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); - #endif - } - for (const auto &s : to_lock) - { - if (std::regex_match(name, std::regex(s))) - { - if(!mlock(tensor->data, ggml_nbytes(tensor))) { - // printf("mlock %s\n", name.c_str()); - } - else { - printf("mlock 
failed %s\n", name.c_str()); - } - } - } - - total_size += ggml_nbytes(tensor); - } - ggml_set_no_alloc(ctx, false); - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - - fin.close(); - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); - // static std::vector buf(buf_size); - static void * buf = ggml_cuda_host_malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i + 2; - } - } - offload_func_t offload_func = opt_nop; - offload_func_t offload_func_kq = opt_nop; - offload_func_t offload_func_v = opt_nop; - offload_func_t offload_func_nr = opt_nop; - offload_func_t offload_debug = opt_nop; -#ifdef GGML_USE_CUBLAS - // offload_debug = ggml_cuda_assign_buffers_no_alloc; - // offload_func = ggml_cuda_assign_buffers_no_alloc; - // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - // offload_func_v = ggml_cuda_assign_buffers_no_alloc; - // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; -#endif - // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; - // int k; - // scanf("%d", &k); - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - ggml_set_name(inpL, "inpL_first"); - // offload_func(inpL); - - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - offload_func(cur); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g); - offload_func(cur); - ggml_set_name(cur, "ln_1_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_1_b); - ggml_set_name(cur, "ln_1_b"); - // offload_func(cur); - - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - - struct ggml_tensor *k_cpy 
= nullptr; - struct ggml_tensor *v_cpy = nullptr; - // self-attention - { - // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); - offload_func_kq(Qcur); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); - offload_func_kq(Kcur); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); - offload_func_kq(Kcur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); - offload_func_v(Vcur); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); - offload_func_v(Vcur); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - offload_func_v(Vcur); - - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - offload_func_kq(k); - // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); - - offload_func_v(v); - k_cpy = ggml_cpy(ctx0, Kcur, k); - offload_func_kq(k_cpy); - ggml_set_name(k_cpy, "k_cpy"); - v_cpy = ggml_cpy(ctx0, Vcur, v); - offload_func_v(v_cpy); - ggml_set_name(v_cpy, "v_cpy"); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); - offload_func_kq(Qcur); - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - offload_func_kq(Q); - - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - // struct ggml_tensor * K = - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 0, 2, 1, 3); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, model.memory_k, - 128, n_past + N, n_head, - ggml_element_size(model.memory_k)*n_embd, - ggml_element_size(model.memory_k)*128, - ggml_element_size(model.memory_k)*n_embd*n_ctx*il); - K->src[1] = k_cpy; - offload_func_kq(K); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // 
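// Standalone sketch, not from the patch: memory_k / memory_v above are single flat F16
// buffers of n_embd * n_ctx * n_layer elements, and each layer's slice is addressed purely
// by byte offsets (the arithmetic the ggml_view_1d/3d calls encode). A scalar version of
// that indexing, with toy sizes that are assumptions for the demo, not real hparams:
#include <cstdio>
#include <cstdint>
#include <vector>

int main() {
    const int n_embd  = 8;                   // assumed toy sizes
    const int n_ctx   = 4;
    const int n_layer = 2;
    const size_t elt  = sizeof(uint16_t);    // cache is stored as F16

    std::vector<uint16_t> memory_k(n_embd * n_ctx * n_layer, 0);

    // where layer `il` writes the key for position `n_past`
    // (cf. the view offset (element_size*n_embd)*(il*n_ctx + n_past) above)
    const int il = 1, n_past = 2;
    const size_t byte_offset = elt * n_embd * (il * n_ctx + n_past);
    const size_t elem_index  = byte_offset / elt;
    memory_k[elem_index] = 42;

    std::printf("layer %d, pos %d -> element %zu (byte offset %zu), value %u\n",
                il, n_past, elem_index, byte_offset, (unsigned) memory_k[elem_index]);
    return 0;
}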
[n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); - offload_func_kq(KQ_scaled); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, 128, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*128, - n_ctx*ggml_element_size(model.memory_k)*n_embd*il); - V->src[1] = v_cpy; - offload_func_v(V); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merge_cont"); - offload_func_v(cur); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_proj"); - offload_func(cur); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_set_name(cur, "attn_bias"); - offload_func(cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - ggml_set_name(cur, "after attn"); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - ggml_tensor *idx = nullptr; - ggml_tensor *idx_g = nullptr; - ggml_tensor *cur_c = nullptr; - - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - offload_func(cur); - ggml_set_name(cur, "norm_FFN"); - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g); - offload_func(cur); - ggml_set_name(cur, "norm_FFN_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_2_b); - // offload_func(cur); - // ggml_set_name(cur, "norm_FFN_w"); - // cur_c = ggml_dup(ctx0, cur); - } - // if (N == 1) - if (1) - { - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w1_w, - inpFF); - offload_func(idx); - ggml_set_name(idx, "mlp_pre_w1"); - idx = ggml_relu(ctx0, idx); - offload_func(idx); - ggml_set_name(idx, "relu_pre"); - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w2_w, - idx); - ggml_set_name(idx, "mlp_pre_w2"); - // offload_func(idx); - // idx = ggml_sigmoid(ctx0, idx); - // offload_func(idx); - // idx_g = idx; - // idx = ggml_dup(ctx0, idx_g); - // ggml_set_name(idx, "idx_cpu_dup"); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - if (N != 1) - // if (0) - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - offload_func(cur); - ggml_set_name(cur, "up_ffn"); - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - offload_func(cur); - } 
- else - { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_fc_w, - // cur); - // offload_func(cur); - // cur = ggml_add(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b); - // offload_func(cur); - - - struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, - model.layers[il].c_mlp_fc_w_gpu, - // model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "mlp_up_gpu"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_mul_mat_idx(ctx0, - model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_idx); - ggml_set_name(cur, "mlp_up_cpu"); - - // cur = ggml_add_idx(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b, - // idx); - // offload_func(cur); - tmp = ggml_add_idx(ctx0, - tmp, - model.layers[il].c_mlp_fc_b, - idx); - offload_debug(tmp); - - - // cur = ggml_add(ctx0, cur, tmp); - // ggml_set_name(cur, "mlp_up_mix"); - // offload_func(cur); - - // cur = tmp; - - } - - - - // GELU activation - // [3072, N] - cur = ggml_relu(ctx0, cur); - // cur_c = cur; - // offload_func(cur); - cur_c = cur->backend==GGML_BACKEND_CPU? cur : ggml_dup(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - // if (N != 1) { - if (0) { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_proj_w, - cur); - offload_func(cur); - ggml_set_name(cur, "down_ffn"); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - offload_func(cur); - } - else { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - // offload_func(cur); - - // cur = ggml_axpy(ctx0, - // model.layers[il].c_mlp_proj_w_t, - // cur, - // NULL, - // NULL); - // offload_func(cur); - - - //here - // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, - // model.layers[il].c_mlp_proj_w_gpu, - // cur, - // model.layers[il].gpu_bucket, - // NULL); - // ggml_set_name(tmp, "axpy"); - // offload_func(tmp); - // offload_debug(tmp); - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur_c, - // NULL, - // NULL); - idx, - model.layers[il].gpu_bucket); - // model.layers[il].gpu_idx); - // offload_func(cur); - - cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); - offload_func(cur); - - // tmp = ggml_add(ctx0, - // tmp, - // model.layers[il].c_mlp_proj_b); - // offload_func(tmp); - // offload_debug(tmp); - - // cur = ggml_add(ctx0, cur, tmp); - // offload_func(cur); - } - - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - offload_func(inpL); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - offload_func_nr(inpL); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_mul(ctx0, - inpL, - model.ln_f_g); - offload_func_nr(inpL); - inpL = ggml_add(ctx0, - inpL, - model.ln_f_b); - ggml_set_name(inpL, "before"); - offload_func_nr(inpL); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_set_name(inpL, "last_layer"); -// offload_func(inpL); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - 
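// Standalone sketch, not from the patch: the FFN above first runs a small ReLU MLP
// (mlp_pre_w1_w / mlp_pre_w2_w) to predict which FFN neurons will fire, then evaluates
// only those rows through the custom ggml_mul_mat_idx / ggml_axpy ops. A scalar CPU
// reference of that idea; the threshold and sizes below are assumptions for the demo:
#include <cstdio>
#include <vector>

// y[j] = sum_k W[j][k] * x[k], computed only for rows the predictor marked active
static std::vector<float> sparse_ffn_up(const std::vector<std::vector<float>> &W,
                                        const std::vector<float> &x,
                                        const std::vector<float> &score,
                                        float threshold) {
    std::vector<float> y(W.size(), 0.0f);
    for (size_t j = 0; j < W.size(); ++j) {
        if (score[j] <= threshold) continue;       // predictor says this neuron stays ~0
        float acc = 0.0f;
        for (size_t k = 0; k < x.size(); ++k) acc += W[j][k] * x[k];
        y[j] = acc > 0.0f ? acc : 0.0f;            // ReLU, as in the graph above
    }
    return y;
}

int main() {
    std::vector<std::vector<float>> W = {{1, 2}, {3, 4}, {5, 6}};  // 3 FFN neurons, n_embd = 2
    std::vector<float> x     = {0.5f, -0.25f};
    std::vector<float> score = {0.9f, -0.3f, 0.7f};                // row 1 predicted inactive
    std::vector<float> y     = sparse_ffn_up(W, x, score, 0.0f);
    for (float v : y) std::printf("%.2f ", v);                     // row 1 is skipped -> 0.00
    std::printf("\n");
    return 0;
}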
embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); - - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); - -#ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } -#endif - - - - // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); - - //if (n_past%100 == 0) { - // ggml_graph_print (gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, "hello world"); - } - printf("load finish\n"); - - // keep this buffer alive while evaluating the model - - struct ggml_allocr * allocr = NULL; - // allocate the compute buffer - { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - // 
compute_buffer.resize(mem_size); - compute_buffer = ggml_cuda_host_malloc(mem_size); - // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); - allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); - - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // submit the input prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - std::vector embd; - - int cnt = 0; - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - cnt += 1; - - if (cnt > 0) - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-2-sparse/main13b.cpp b/examples/gpt-2-sparse/main13b.cpp deleted file mode 100644 index 0681da3e..00000000 --- a/examples/gpt-2-sparse/main13b.cpp +++ /dev/null @@ -1,1583 +0,0 @@ -#include "ggml.h" -#include "ggml-alloc.h" -#include - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-cuda.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif -typedef void (*offload_func_t)(struct ggml_tensor * tensor); -void opt_nop(struct ggml_tensor * tensor) { // 
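// Standalone sketch, not from the patch: the generation loop above delegates sampling to
// gpt_sample_top_k_top_p() from common.h. A generic, self-contained top-k / top-p (nucleus)
// sampler over a raw logits vector, not necessarily identical to that helper's behaviour:
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

static int sample_top_k_top_p(const std::vector<float> &logits,
                              int top_k, float top_p, float temp, std::mt19937 &rng) {
    const int n = (int) logits.size();
    std::vector<int> idx(n);
    for (int i = 0; i < n; ++i) idx[i] = i;

    // sort token ids by logit, descending, and keep only the top_k of them
    std::sort(idx.begin(), idx.end(), [&](int a, int b) { return logits[a] > logits[b]; });
    if (top_k > 0 && top_k < n) idx.resize(top_k);

    // softmax with temperature over the surviving candidates
    std::vector<double> probs(idx.size());
    double maxl = logits[idx[0]], sum = 0.0;
    for (size_t i = 0; i < idx.size(); ++i) {
        probs[i] = std::exp((logits[idx[i]] - maxl) / temp);
        sum += probs[i];
    }
    for (double &p : probs) p /= sum;

    // nucleus cut: keep the smallest prefix whose cumulative probability reaches top_p;
    // std::discrete_distribution renormalizes the kept weights internally
    double cum = 0.0;
    size_t keep = probs.size();
    for (size_t i = 0; i < probs.size(); ++i) {
        cum += probs[i];
        if (cum >= top_p) { keep = i + 1; break; }
    }
    probs.resize(keep);

    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return idx[dist(rng)];
}

int main() {
    std::mt19937 rng(42);
    std::vector<float> logits = {1.0f, 4.0f, 0.5f, 3.0f};          // toy vocabulary of 4 tokens
    std::printf("sampled token id = %d\n", sample_top_k_top_p(logits, 3, 0.9f, 0.8f, rng));
    return 0;
}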
don't offload by default - (void) tensor; -} -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - // struct ggml_tensor * c_attn_attn_w; - // struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_attn_q_w; - struct ggml_tensor * c_attn_attn_q_b; - - struct ggml_tensor * c_attn_attn_k_w; - struct ggml_tensor * c_attn_attn_k_b; - - struct ggml_tensor * c_attn_attn_v_w; - struct ggml_tensor * c_attn_attn_v_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; - - struct ggml_tensor * gpu_idx; - struct ggml_tensor * gpu_bucket; - // gpu heat - struct ggml_tensor * c_mlp_fc_w_gpu; - struct ggml_tensor * c_mlp_proj_w_t; - struct ggml_tensor * c_mlp_proj_w_gpu; - - //predictor - struct ggml_tensor * mlp_pre_w1_w; - struct ggml_tensor * mlp_pre_w2_w; -}; - -struct opt_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - opt_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error("opt_file fail\n"); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - ~opt_file() { - if (fp) { - std::fclose(fp); - } - } -}; -#define _POSIX_MAPPED_FILES -#include -#include - -struct opt_mmap { - void * addr; - size_t size; - - opt_mmap(const opt_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error("mmap failed\n"); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~opt_mmap() { - munmap(addr, size); - } -#else - static constexpr bool SUPPORTED = false; - - opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw 
std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -struct gpt2_model { - gpt2_hparams hparams; - struct opt_file * file; - struct opt_mmap * mapping; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -struct ggml_context * ctx0 = nullptr; -// std::vector compute_buffer; -void *compute_buffer; - -bool endsWith(const std::string& str, const std::string& suffix) { - if (str.length() < suffix.length()) { - return false; - } - return str.substr(str.length() - suffix.length()) == suffix; -} - - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - model.file = new opt_file(fname.c_str(), "rb"); - printf("size %d\n", model.file->size); - model.mapping = new opt_mmap(model.file, 0, false); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - /* int32_t n_vocab = 0; */ - /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ - - /* if (n_vocab != model.hparams.n_vocab) { */ - /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ - /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ - /* return false; */ - /* } */ - int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - 
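// Standalone sketch, not from the patch: opt_file/opt_mmap above map the model file
// read-only, and the loader then points tensor->data at mapping->addr plus the tensor's
// file offset, so weights are paged in on demand rather than copied into the ggml context.
// A minimal POSIX example of that zero-copy pattern (argv[1] is any non-empty file):
#include <cstdio>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv) {
    if (argc < 2) { std::fprintf(stderr, "usage: %s <file>\n", argv[0]); return 1; }

    std::FILE *fp = std::fopen(argv[1], "rb");
    if (!fp) { std::perror("fopen"); return 1; }

    struct stat st;
    if (fstat(fileno(fp), &st) != 0 || st.st_size == 0) {
        std::fprintf(stderr, "could not stat file, or file is empty\n");
        std::fclose(fp);
        return 1;
    }

    // map the whole file read-only; MAP_SHARED matches what opt_mmap uses
    void *addr = mmap(nullptr, (size_t) st.st_size, PROT_READ, MAP_SHARED, fileno(fp), 0);
    if (addr == MAP_FAILED) { std::perror("mmap"); std::fclose(fp); return 1; }

    // a "tensor" whose data lives inside the mapping: just a pointer at some offset
    const size_t offset = 0;                         // would be fin.tellg() in the loader
    const unsigned char *tensor_data = (const unsigned char *) addr + offset;
    std::printf("first byte of mapped data: 0x%02x (file size %lld)\n",
                tensor_data[0], (long long) st.st_size);

    munmap(addr, (size_t) st.st_size);
    std::fclose(fp);
    return 0;
}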
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - printf("wtype %d\n", wtype); - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - //need refactor - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); - //predictor - ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size = 0; - - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*51200; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - int main_gpu = 0; -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#else -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - 
const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; - // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; - - // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); - // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // model.lm_head->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["output_norm.weight"] = &model.ln_f_g; - model.tensors["output_norm.bias"] = &model.ln_f_b; - - model.tensors["tok_embeddings.weight"] = &model.wte; - model.tensors["pos_embeddings.weight"] = &model.wpe; - model.tensors["output.weight"] = &model.lm_head; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - memset(&layer, 0, sizeof(gpt2_layer)); - - // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - // // need refine - // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); - // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); - // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); - - // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); - // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); - - // if (i <= 10) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); - // } else if (i <= 12) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); - // } else if (i <= 18) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); - - // } else if (i <= 21) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); - // } else if (i <= 26) { - // 
layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); - // } else if (i <= 31) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); - // } - - // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; - - // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; - // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; - model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; - - model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; - model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; - model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; - model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; - model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; - model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; - - model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; - model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; - - model.tensors["layers." 
+ std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; - model.tensors["layers." + std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; - } - } - - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - #ifdef GGML_USE_CUBLAS - // ggml_cuda_assign_buffers_no_scratch(model.memory_k); - // ggml_cuda_assign_buffers_no_scratch(model.memory_v); - #endif - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - ggml_set_no_alloc(ctx, true); - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - const std::vector to_gpu = { - "output_norm.bias", - "output_norm.weight", - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - // ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - "layers.*output_norm.weight", - "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_gpu_lv = { - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - // "layers.*output_norm.weight", - // "layers.*output_norm.bias", - // ".*fc1.weight", - // ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_lock = { - "tok_embeddings.weight", - "pos_embeddings.weight", - // "output_norm.bias", - ".*attention.wq.weight", - ".*attention.wq.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - // ".*attention.wo.bias", - ".*feed_forward.w1.weight", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight", - ".*feed_forward.w2.bias", - ".*gpu.weight", - ".*attention_norm.weight", - ".*attention_norm.bias", - ".*output_norm.weight", - ".*output_norm.bias", - 
".*attention.*fc1.weight", - ".*attention.*fc1.bias", - ".*attention.*fc2.weight", - ".*attention.*fc2.bias", - // ".*w2.bias", - // ".*w1.bias", - "output.weight", - }; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - int64_t new_ne[2]; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - new_ne[i] = ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - ggml_tensor ** ptr = model.tensors[name]; - // printf("name %s ptr %p\n", name.c_str(), *ptr); - // int k; - // scanf("%d", &k); - *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); - - auto tensor = (ggml_tensor *)*model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - std::streampos offset = fin.tellg(); - // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - fin.seekg(ggml_nbytes(tensor), std::ios::cur); - tensor->data = model.mapping->addr + static_cast(offset); - // if ( endsWith(name.c_str(), "weight_transpose")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i+4096]); - // } - // } - // printf("\n"); - // if (endsWith(name.c_str(), "weight_h20")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i]); - - // } - // int k; - // scanf("%d", &k); - // } - - // // GPT-2 models share the WTE tensor as the LM head - // if (name == "model/wte" && has_lm_head == false) { - // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - // } - - // if (name == "model/lm_head") { - // has_lm_head = true; - // } - if (model_params.low_vram == false) { - for (const auto &s : to_gpu) - { - // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) - // { - // std::regex pattern(R"(\d+)"); - // std::smatch match; - // int layer_id = 0; - // if (std::regex_search(name, match, pattern)) - // { - // std::string digitStr = match.str(); - // int num = std::stoi(digitStr); - // layer_id = num; - // } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - // if (layer_id > 
model_params.n_gpu_layers) - // break; - // } - // printf("name %s\n", name.c_str()); - if (std::regex_search(name, std::regex(s))) - { - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - } else { - for (const auto &s : to_gpu_lv) - { - if (std::regex_search(name, std::regex(s))) - { - std::regex pattern(R"(\d+)"); - std::smatch match; - int layer_id = 0; - if (std::regex_search(name, match, pattern)) - { - std::string digitStr = match.str(); - int num = std::stoi(digitStr); - layer_id = num; - } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - if (layer_id > model_params.n_gpu_layers) - break; - // printf("name %s\n", name.c_str()); - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - - } - if (tensor->backend == GGML_BACKEND_GPU) { - #if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); - #endif - } - for (const auto &s : to_lock) - { - if (std::regex_match(name, std::regex(s))) - { - if(!mlock(tensor->data, ggml_nbytes(tensor))) { - // printf("mlock %s\n", name.c_str()); - } - else { - printf("mlock failed %s\n", name.c_str()); - } - } - } - - total_size += ggml_nbytes(tensor); - } - ggml_set_no_alloc(ctx, false); - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - - fin.close(); - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); - // static std::vector buf(buf_size); - static void * buf = ggml_cuda_host_malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i + 2; - } - } - offload_func_t offload_func = opt_nop; - offload_func_t offload_func_kq = opt_nop; - offload_func_t offload_func_v = opt_nop; - offload_func_t offload_func_nr = opt_nop; - offload_func_t offload_debug = opt_nop; -#ifdef GGML_USE_CUBLAS - offload_debug = ggml_cuda_assign_buffers_no_alloc; - // offload_func = ggml_cuda_assign_buffers_no_alloc; - // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - // offload_func_v = ggml_cuda_assign_buffers_no_alloc; - // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; -#endif - // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; - // int k; - // scanf("%d", &k); - - struct ggml_tensor * KQ_scale = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - ggml_set_name(inpL, "inpL_first"); - // offload_func(inpL); - - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - offload_func(cur); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g); - offload_func(cur); - ggml_set_name(cur, "ln_1_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_1_b); - ggml_set_name(cur, "ln_1_b"); - // offload_func(cur); - - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - - struct ggml_tensor *k_cpy = nullptr; - struct ggml_tensor *v_cpy = nullptr; - // self-attention - { - // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); - offload_func_kq(Qcur); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); - offload_func_kq(Kcur); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); - offload_func_kq(Kcur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); - offload_func_v(Vcur); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); - offload_func_v(Vcur); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - offload_func_v(Vcur); - - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - offload_func_kq(k); - // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); - - offload_func_v(v); - k_cpy = ggml_cpy(ctx0, Kcur, k); - offload_func_kq(k_cpy); - ggml_set_name(k_cpy, "k_cpy"); - v_cpy = ggml_cpy(ctx0, Vcur, v); - offload_func_v(v_cpy); - ggml_set_name(v_cpy, "v_cpy"); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); - offload_func_kq(Qcur); - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - offload_func_kq(Q); - - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - // struct ggml_tensor * K = - // ggml_permute(ctx0, 
- // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 0, 2, 1, 3); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, model.memory_k, - 128, n_past + N, n_head, - ggml_element_size(model.memory_k)*n_embd, - ggml_element_size(model.memory_k)*128, - ggml_element_size(model.memory_k)*n_embd*n_ctx*il); - K->src[1] = k_cpy; - offload_func_kq(K); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); - offload_func_kq(KQ_scaled); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, 128, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*128, - n_ctx*ggml_element_size(model.memory_k)*n_embd*il); - V->src[1] = v_cpy; - offload_func_v(V); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merge_cont"); - offload_func_v(cur); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_proj"); - offload_func(cur); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_set_name(cur, "attn_bias"); - offload_func(cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - ggml_set_name(cur, "after attn"); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - ggml_tensor *idx = nullptr; - ggml_tensor *idx_g = nullptr; - ggml_tensor *cur_c = nullptr; - - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - offload_func(cur); - ggml_set_name(cur, "norm_FFN"); - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g); - offload_func(cur); - ggml_set_name(cur, "norm_FFN_g"); - cur = ggml_add(ctx0, - cur, - 
model.layers[il].ln_2_b); - // offload_func(cur); - // ggml_set_name(cur, "norm_FFN_w"); - // cur_c = ggml_dup(ctx0, cur); - } - // if (N == 1) - if (1) - { - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w1_w, - cur); - offload_func(idx); - ggml_set_name(idx, "mlp_pre_w1"); - idx = ggml_relu(ctx0, idx); - offload_func(idx); - ggml_set_name(idx, "relu_pre"); - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w2_w, - idx); - ggml_set_name(idx, "mlp_pre_w2"); - // offload_func(idx); - // idx = ggml_sigmoid(ctx0, idx); - // offload_func(idx); - // idx_g = idx; - // idx = ggml_dup(ctx0, idx_g); - // ggml_set_name(idx, "idx_cpu_dup"); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - // if (N != 1) - if (0) - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - offload_func(cur); - ggml_set_name(cur, "up_ffn"); - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - offload_func(cur); - } - else - { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_fc_w, - // cur); - // offload_func(cur); - // cur = ggml_add(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b); - // offload_func(cur); - - - struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, - model.layers[il].c_mlp_fc_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "mlp_up_gpu"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_mul_mat_idx(ctx0, - model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_idx); - ggml_set_name(cur, "mlp_up_cpu"); - tmp = ggml_add_idx(ctx0, - tmp, - model.layers[il].c_mlp_fc_b, - idx); - ggml_set_name(tmp, "mlp_up_bias"); - offload_debug(tmp); - offload_func(tmp); - - cur = ggml_add(ctx0, cur, tmp); - ggml_set_name(cur, "mlp_up_mix"); - offload_func(cur); - - // cur = tmp; - - } - - - - // GELU activation - // [3072, N] - cur = ggml_relu(ctx0, cur); - // cur_c = cur; - // offload_func(cur); - cur_c = cur->backend==GGML_BACKEND_CPU? 
cur : ggml_dup(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - // if (N != 1) { - if (0) { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_proj_w, - cur); - offload_func(cur); - ggml_set_name(cur, "down_ffn"); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - offload_func(cur); - } - else { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - // offload_func(cur); - - // cur = ggml_axpy(ctx0, - // model.layers[il].c_mlp_proj_w_t, - // cur, - // NULL, - // NULL); - // offload_func(cur); - - - // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, - // model.layers[il].c_mlp_proj_w_gpu, - // cur, - // model.layers[il].gpu_bucket, - // NULL); - struct ggml_tensor *tmp = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "axpy"); - offload_func(tmp); - offload_debug(tmp); - - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur_c, - idx, - model.layers[il].gpu_idx); - - cur = ggml_add(ctx0, cur, tmp); - offload_func(cur); - - cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); - offload_func(cur); - - // tmp = ggml_add(ctx0, - // tmp, - // model.layers[il].c_mlp_proj_b); - // offload_func(tmp); - // offload_debug(tmp); - - // cur = tmp; - } - - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - offload_func(inpL); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - offload_func_nr(inpL); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_mul(ctx0, - inpL, - model.ln_f_g); - offload_func_nr(inpL); - inpL = ggml_add(ctx0, - inpL, - model.ln_f_b); - ggml_set_name(inpL, "before"); - offload_func_nr(inpL); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_set_name(inpL, "last_layer"); -// offload_func(inpL); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); - - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); - -#ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if 
(node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } -#endif - - - - // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); - - //if (n_past%100 == 0) { - // ggml_graph_print (gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, "hello world"); - } - printf("load finish\n"); - - // keep this buffer alive while evaluating the model - - struct ggml_allocr * allocr = NULL; - // allocate the compute buffer - { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - // compute_buffer.resize(mem_size); - compute_buffer = ggml_cuda_host_malloc(mem_size); - // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); - allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); - - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // submit the input prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - 
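// The generation loop below calls gpt_sample_top_k_top_p() from common.h, whose
// definition is not part of this hunk. For reference only, a minimal standalone
// sketch of the usual top-k / top-p (nucleus) sampling scheme such a helper
// implements -- an assumption about its behaviour, not the actual common.cpp
// code; temp is assumed to be > 0.
#include <algorithm>
#include <cmath>
#include <random>
#include <utility>
#include <vector>

static int sample_top_k_top_p_sketch(const std::vector<float> & logits,
                                     int top_k, float top_p, float temp,
                                     std::mt19937 & rng) {
    const int n = (int) logits.size();
    if (top_k <= 0 || top_k > n) top_k = n;

    // pair each temperature-scaled logit with its token id and keep the k largest
    std::vector<std::pair<float, int>> cand(n);
    for (int i = 0; i < n; ++i) cand[i] = { logits[i] / temp, i };
    std::partial_sort(cand.begin(), cand.begin() + top_k, cand.end(),
                      [](const auto & a, const auto & b) { return a.first > b.first; });
    cand.resize(top_k);

    // softmax over the surviving candidates
    const float maxl = cand.front().first;
    std::vector<double> probs(top_k);
    double sum = 0.0;
    for (int i = 0; i < top_k; ++i) { probs[i] = std::exp(cand[i].first - maxl); sum += probs[i]; }
    for (auto & p : probs) p /= sum;

    // nucleus cut: keep the smallest prefix whose cumulative mass reaches top_p
    double cum = 0.0;
    size_t keep = probs.size();
    for (size_t i = 0; i < probs.size(); ++i) { cum += probs[i]; if (cum >= top_p) { keep = i + 1; break; } }
    probs.resize(keep);

    // draw one candidate proportionally to its (renormalized) probability
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cand[dist(rng)].second; // token id
}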
std::vector embd; - - int cnt = 0; - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - cnt += 1; - - if (cnt > 0) - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - llama_sampling_params & sparams = params.sparams; - const int top_k = sparams.top_k; - const float top_p = sparams.top_p; - const float temp = sparams.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-2-sparse/main7b.cpp b/examples/gpt-2-sparse/main7b.cpp deleted file mode 100644 index a07a5472..00000000 --- a/examples/gpt-2-sparse/main7b.cpp +++ /dev/null @@ -1,1567 +0,0 @@ -#include "ggml.h" -#include "ggml-alloc.h" -#include - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-cuda.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif -typedef void (*offload_func_t)(struct ggml_tensor * tensor); -void opt_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; -} -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - // struct ggml_tensor * c_attn_attn_w; - // struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_attn_q_w; - struct ggml_tensor * c_attn_attn_q_b; - - struct ggml_tensor * c_attn_attn_k_w; - struct ggml_tensor * c_attn_attn_k_b; - - struct ggml_tensor * c_attn_attn_v_w; - struct ggml_tensor * c_attn_attn_v_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct 
ggml_tensor * c_mlp_proj_b; - - struct ggml_tensor * gpu_idx; - struct ggml_tensor * gpu_bucket; - // gpu heat - struct ggml_tensor * c_mlp_fc_w_gpu; - struct ggml_tensor * c_mlp_proj_w_t; - struct ggml_tensor * c_mlp_proj_w_gpu; - - //predictor - struct ggml_tensor * mlp_pre_w1_w; - struct ggml_tensor * mlp_pre_w2_w; -}; - -struct opt_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - opt_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error("opt_file fail\n"); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - ~opt_file() { - if (fp) { - std::fclose(fp); - } - } -}; -#define _POSIX_MAPPED_FILES -#include -#include - -struct opt_mmap { - void * addr; - size_t size; - - opt_mmap(const opt_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error("mmap failed\n"); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~opt_mmap() { - munmap(addr, size); - } -#else - static constexpr bool SUPPORTED = false; - - opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -struct gpt2_model { - gpt2_hparams hparams; - struct opt_file * file; - struct opt_mmap * mapping; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -struct ggml_context * ctx0 = nullptr; -// std::vector compute_buffer; -void *compute_buffer; - -bool endsWith(const std::string& str, const std::string& suffix) { - if (str.length() < suffix.length()) { - return false; - } - return str.substr(str.length() - suffix.length()) == suffix; -} - - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & 
vocab, gpt_params model_params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - model.file = new opt_file(fname.c_str(), "rb"); - printf("size %d\n", model.file->size); - model.mapping = new opt_mmap(model.file, 0, false); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - /* int32_t n_vocab = 0; */ - /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ - - /* if (n_vocab != model.hparams.n_vocab) { */ - /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ - /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ - /* return false; */ - /* } */ - int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - printf("wtype %d\n", wtype); - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += 
n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - //need refactor - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); - //predictor - ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size = 0; - - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*51200; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - int main_gpu = 0; -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#else -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; - // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; - - // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); - // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // model.lm_head->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["output_norm.weight"] = &model.ln_f_g; - model.tensors["output_norm.bias"] = &model.ln_f_b; - - model.tensors["tok_embeddings.weight"] = &model.wte; - model.tensors["pos_embeddings.weight"] = &model.wpe; - model.tensors["output.weight"] = &model.lm_head; - - for (int 
i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - memset(&layer, 0, sizeof(gpt2_layer)); - - // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - // // need refine - // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); - // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); - // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); - - // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); - // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); - - // if (i <= 10) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); - // } else if (i <= 12) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); - // } else if (i <= 18) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); - - // } else if (i <= 21) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); - // } else if (i <= 26) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); - // } else if (i <= 31) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); - // } - - // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_b->backend = 
OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; - - // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; - // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; - model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; - - model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; - model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; - model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; - model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; - model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; - model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; - - model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; - model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; - model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; - } - } - - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - #ifdef GGML_USE_CUBLAS - ggml_cuda_assign_buffers_no_scratch(model.memory_k); - ggml_cuda_assign_buffers_no_scratch(model.memory_v); - #endif - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - ggml_set_no_alloc(ctx, true); - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - const std::vector to_gpu = { - "output_norm.bias", - "output_norm.weight", - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - // ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - "layers.*output_norm.weight", - "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_gpu_lv = { - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - // "layers.*output_norm.weight", - // "layers.*output_norm.bias", - // ".*fc1.weight", - // ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_lock = { - "tok_embeddings.weight", - "pos_embeddings.weight", - // "output_norm.bias", - ".*attention.wq.weight", - ".*attention.wq.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - // ".*attention.wo.bias", - ".*feed_forward.w1.weight", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight", - ".*feed_forward.w2.bias", - ".*gpu.weight", - ".*attention_norm.weight", - ".*attention_norm.bias", - ".*output_norm.weight", - ".*output_norm.bias", - ".*attention.*fc1.weight", - ".*attention.*fc1.bias", - ".*attention.*fc2.weight", - ".*attention.*fc2.bias", - // ".*w2.bias", - // ".*w1.bias", - "output.weight", - }; - - while (true) { - int32_t n_dims; - int32_t length; - 
int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - int64_t new_ne[2]; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - new_ne[i] = ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - ggml_tensor ** ptr = model.tensors[name]; - // printf("name %s ptr %p\n", name.c_str(), *ptr); - // int k; - // scanf("%d", &k); - *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); - - auto tensor = (ggml_tensor *)*model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - std::streampos offset = fin.tellg(); - // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - fin.seekg(ggml_nbytes(tensor), std::ios::cur); - tensor->data = model.mapping->addr + static_cast(offset); - // if ( endsWith(name.c_str(), "weight_transpose")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i+4096]); - // } - // } - // printf("\n"); - // if (endsWith(name.c_str(), "weight_h20")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i]); - - // } - // int k; - // scanf("%d", &k); - // } - - // // GPT-2 models share the WTE tensor as the LM head - // if (name == "model/wte" && has_lm_head == false) { - // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - // } - - // if (name == "model/lm_head") { - // has_lm_head = true; - // } - if (model_params.low_vram == false) { - for (const auto &s : to_gpu) - { - if (std::regex_search(name, std::regex(s))) - { - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - } else { - for (const auto &s : to_gpu_lv) - { - if (std::regex_search(name, std::regex(s))) - { - std::regex pattern(R"(\d+)"); - std::smatch match; - int layer_id = 0; - if (std::regex_search(name, match, pattern)) - { - std::string digitStr = match.str(); - int num = std::stoi(digitStr); - layer_id = num; - } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - if (layer_id > model_params.n_gpu_layers) - break; - // printf("name %s\n", name.c_str()); - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - - } - if (tensor->backend == GGML_BACKEND_GPU) { 
- #if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); - #endif - } - for (const auto &s : to_lock) - { - if (std::regex_match(name, std::regex(s))) - { - if(!mlock(tensor->data, ggml_nbytes(tensor))) { - // printf("mlock %s\n", name.c_str()); - } - else { - printf("mlock failed %s\n", name.c_str()); - } - } - } - - total_size += ggml_nbytes(tensor); - } - ggml_set_no_alloc(ctx, false); - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - - fin.close(); - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); - // static std::vector buf(buf_size); - static void * buf = ggml_cuda_host_malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i + 2; - } - } - offload_func_t offload_func = opt_nop; - offload_func_t offload_func_kq = opt_nop; - offload_func_t offload_func_v = opt_nop; - offload_func_t offload_func_nr = opt_nop; - offload_func_t offload_debug = opt_nop; -#ifdef GGML_USE_CUBLAS - offload_debug = ggml_cuda_assign_buffers_no_alloc; - offload_func = ggml_cuda_assign_buffers_no_alloc; - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; -#endif - // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; - // int k; - // scanf("%d", &k); - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - ggml_set_name(inpL, "inpL_first"); - // offload_func(inpL); - - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - offload_func(cur); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g); - offload_func(cur); - ggml_set_name(cur, "ln_1_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_1_b); - 
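// The three ops just above -- ggml_norm, ggml_mul by ln_1_g, ggml_add of ln_1_b --
// compose a standard LayerNorm with a learned scale and shift:
//     y = g * (x - mean(x)) / sqrt(var(x) + eps) + b
// A scalar reference sketch for one n_embd-sized row (illustrative only, not the
// ggml kernel):
#include <cmath>
#include <vector>

static std::vector<float> layer_norm_ref(const std::vector<float> & x,
                                         const std::vector<float> & g,
                                         const std::vector<float> & b,
                                         float eps) {
    const size_t n = x.size();
    float mean = 0.0f;
    for (float v : x) mean += v;
    mean /= n;
    float var = 0.0f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= n;
    std::vector<float> y(n);
    for (size_t i = 0; i < n; ++i) {
        y[i] = g[i] * (x[i] - mean) / std::sqrt(var + eps) + b[i];
    }
    return y;
}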
ggml_set_name(cur, "ln_1_b"); - // offload_func(cur); - - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - - struct ggml_tensor *k_cpy = nullptr; - struct ggml_tensor *v_cpy = nullptr; - // self-attention - { - // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); - offload_func_kq(Qcur); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); - offload_func_kq(Kcur); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); - offload_func_kq(Kcur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); - offload_func_v(Vcur); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); - offload_func_v(Vcur); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - offload_func_v(Vcur); - - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - offload_func_kq(k); - // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); - - offload_func_v(v); - k_cpy = ggml_cpy(ctx0, Kcur, k); - offload_func_kq(k_cpy); - ggml_set_name(k_cpy, "k_cpy"); - v_cpy = ggml_cpy(ctx0, Vcur, v); - offload_func_v(v_cpy); - ggml_set_name(v_cpy, "v_cpy"); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); - offload_func_kq(Qcur); - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - offload_func_kq(Q); - - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - // struct ggml_tensor * K = - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 0, 2, 1, 3); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, model.memory_k, - 128, n_past + N, n_head, - ggml_element_size(model.memory_k)*n_embd, - ggml_element_size(model.memory_k)*128, - ggml_element_size(model.memory_k)*n_embd*n_ctx*il); - K->src[1] = k_cpy; - offload_func_kq(K); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // 
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); - offload_func_kq(KQ_scaled); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, 128, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*128, - n_ctx*ggml_element_size(model.memory_k)*n_embd*il); - V->src[1] = v_cpy; - offload_func_v(V); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merge_cont"); - offload_func_v(cur); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_proj"); - offload_func(cur); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_set_name(cur, "attn_bias"); - offload_func(cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - ggml_set_name(cur, "after attn"); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - ggml_tensor *idx = nullptr; - ggml_tensor *idx_g = nullptr; - ggml_tensor *cur_c = nullptr; - - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - offload_func(cur); - ggml_set_name(cur, "norm_FFN"); - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g); - offload_func(cur); - ggml_set_name(cur, "norm_FFN_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_2_b); - // offload_func(cur); - // ggml_set_name(cur, "norm_FFN_w"); - // cur_c = ggml_dup(ctx0, cur); - } - // if (N == 1) - if (1) - { - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w1_w, - cur); - offload_func(idx); - ggml_set_name(idx, "mlp_pre_w1"); - idx = ggml_relu(ctx0, idx); - offload_func(idx); - ggml_set_name(idx, "relu_pre"); - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w2_w, - idx); - ggml_set_name(idx, "mlp_pre_w2"); - // offload_func(idx); - // idx = ggml_sigmoid(ctx0, idx); - // offload_func(idx); - // idx_g = idx; - // idx = ggml_dup(ctx0, idx_g); - // ggml_set_name(idx, "idx_cpu_dup"); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) 
- // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - if (N >= 80) - // if (0) - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - offload_func(cur); - ggml_set_name(cur, "up_ffn"); - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - offload_func(cur); - } - else - { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_fc_w, - // cur); - // offload_func(cur); - // cur = ggml_add(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b); - // offload_func(cur); - - - struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, - model.layers[il].c_mlp_fc_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "mlp_up_gpu"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_mul_mat_idx(ctx0, - model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_idx); - ggml_set_name(cur, "mlp_up_cpu"); - tmp = ggml_add_idx(ctx0, - tmp, - model.layers[il].c_mlp_fc_b, - idx); - ggml_set_name(tmp, "mlp_up_bias"); - offload_debug(tmp); - offload_func(tmp); - - cur = ggml_add(ctx0, cur, tmp); - ggml_set_name(cur, "mlp_up_mix"); - offload_func(cur); - - // cur = tmp; - - } - - - - // GELU activation - // [3072, N] - cur = ggml_relu(ctx0, cur); - // cur_c = cur; - // offload_func(cur); - cur_c = cur->backend==GGML_BACKEND_CPU? cur : ggml_dup(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - if (N >= 80) { - // if (0) { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_proj_w, - cur); - offload_func(cur); - ggml_set_name(cur, "down_ffn"); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - offload_func(cur); - } - else { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - // offload_func(cur); - - // cur = ggml_axpy(ctx0, - // model.layers[il].c_mlp_proj_w_t, - // cur, - // NULL, - // NULL); - // offload_func(cur); - - - // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, - // model.layers[il].c_mlp_proj_w_gpu, - // cur, - // model.layers[il].gpu_bucket, - // NULL); - struct ggml_tensor *tmp = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "axpy"); - offload_func(tmp); - offload_debug(tmp); - - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur_c, - idx, - model.layers[il].gpu_idx); - - cur = ggml_add(ctx0, cur, tmp); - offload_func(cur); - - cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); - offload_func(cur); - - // tmp = ggml_add(ctx0, - // tmp, - // model.layers[il].c_mlp_proj_b); - // offload_func(tmp); - // offload_debug(tmp); - - // cur = tmp; - } - - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - offload_func(inpL); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - offload_func_nr(inpL); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_mul(ctx0, - inpL, - model.ln_f_g); - offload_func_nr(inpL); - inpL = ggml_add(ctx0, - inpL, - model.ln_f_b); - ggml_set_name(inpL, "before"); - offload_func_nr(inpL); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_set_name(inpL, "last_layer"); -// offload_func(inpL); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate 
the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); - - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); - -#ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } -#endif - - - - // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); - - //if (n_past%100 == 0) { - // ggml_graph_print (gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, "hello world"); - } - printf("load finish\n"); - - // keep this buffer alive while evaluating the model - - struct ggml_allocr * allocr = NULL; - // allocate the compute buffer - { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = 
gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - // compute_buffer.resize(mem_size); - compute_buffer = ggml_cuda_host_malloc(mem_size); - // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); - allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); - - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // submit the input prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - std::vector embd; - - int cnt = 0; - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - cnt += 1; - - if (cnt > 0) - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - llama_sampling_params & sparams = params.sparams; - const int top_k = sparams.top_k; - const float top_p = sparams.top_p; - const float temp = sparams.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-2-sparse/quantize.cpp b/examples/gpt-2-sparse/quantize.cpp deleted file mode 100644 index f81c04e8..00000000 --- a/examples/gpt-2-sparse/quantize.cpp +++ /dev/null @@ -1,184 +0,0 @@ -#include "ggml.h" - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include 
-#include -#include -#include -#include -#include -#include - -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; -}; - -// quantize a model -bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { - gpt_vocab vocab; - - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); - - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); - return false; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - finp.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); - return false; - } - - fout.write((char *) &magic, sizeof(magic)); - } - - gpt2_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; - const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); - printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); - printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); - printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); - - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); - fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fout.write((char *) &ftype_dst, sizeof(ftype_dst)); - } - - // load vocab - { - int32_t n_vocab = 0; - finp.read ((char *) &n_vocab, sizeof(n_vocab)); - fout.write((char *) &n_vocab, sizeof(n_vocab)); - - if (n_vocab != hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); - return false; - } - - std::string word; - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read ((char *) &len, sizeof(len)); - fout.write((char *) &len, sizeof(len)); - - word.resize(len); - finp.read ((char *) word.data(), len); - fout.write((char *) word.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // regexes of tensor names to be quantized - const std::vector to_quant = { - "model/wte", - "model/lm_head", - "model/h.*/attn/c_attn/w", - "model/h.*/attn/c_proj/w", - "model/h.*/mlp/c_fc/w", - "model/h.*/mlp/c_proj/w", - }; - - if 
(!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { - fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); - return false; - } - - finp.close(); - fout.close(); - - return true; -} - -// usage: -// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type -// -int main(int argc, char ** argv) { - if (argc != 4) { - fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); - ggml_print_ftypes(stderr); - return 1; - } - - // needed to initialize f16 tables - { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } - - const std::string fname_inp = argv[1]; - const std::string fname_out = argv[2]; - - const ggml_ftype ftype = ggml_parse_ftype(argv[3]); - - const int64_t t_main_start_us = ggml_time_us(); - - int64_t t_quantize_us = 0; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { - fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); - return 1; - } - - t_quantize_us = ggml_time_us() - t_start_us; - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n"); - printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - return 0; -} diff --git a/llama.cpp b/llama.cpp index 3ae9e946..2a9ea030 100644 --- a/llama.cpp +++ b/llama.cpp @@ -230,6 +230,7 @@ enum llm_arch { LLM_ARCH_GPT2, LLM_ARCH_GPTJ, LLM_ARCH_GPTNEOX, + LLM_ARCH_OPT, LLM_ARCH_MPT, LLM_ARCH_STARCODER, LLM_ARCH_PERSIMMON, @@ -246,6 +247,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_GPT2, "gpt2" }, { LLM_ARCH_GPTJ, "gptj" }, { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_OPT, "opt" }, { LLM_ARCH_MPT, "mpt" }, { LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_STARCODER, "starcoder" }, @@ -483,6 +485,23 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_OPT, + { + {LLM_TENSOR_TOKEN_EMBD, "token_embd"}, + {LLM_TENSOR_POS_EMBD, "position_embd"}, + {LLM_TENSOR_OUTPUT_NORM, "output_norm"}, + {LLM_TENSOR_OUTPUT, "output"}, + {LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"}, + {LLM_TENSOR_ATTN_Q, "blk.%d.attn_q"}, + {LLM_TENSOR_ATTN_K, "blk.%d.attn_k"}, + {LLM_TENSOR_ATTN_V, "blk.%d.attn_v"}, + {LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"}, + {LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"}, + {LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"}, + {LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"}, + }, + }, { LLM_ARCH_PERSIMMON, { @@ -1321,6 +1340,9 @@ struct llama_layer { struct ggml_tensor * wqkv; // attention bias + struct ggml_tensor * bq; + struct ggml_tensor * bk; + struct ggml_tensor * bv; struct ggml_tensor * bo; struct ggml_tensor * bqkv; @@ -2341,6 +2363,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_OPT: + { + // TODO: GGUF_GET_KEY & support different model versions + hparams.n_ctx_train = 2050; // TODO: hard coded for now + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 48: model.type = e_model::MODEL_30B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_FALCON: { GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); @@ -3229,6 +3262,10 
@@ static void llm_load_sparse_model_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_OPT: + { + // TODO: load sparse tensor model + } break; case LLM_ARCH_FALCON: { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -3482,6 +3519,81 @@ static void llm_load_tensors( } } } break; + case LLM_ARCH_OPT: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = llama_backend_offload; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; +#endif // _WIN32 + + backend_output = llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + // model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, backend_output); // same as token_embed + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + } + // if (backend_output == GGML_BACKEND_GPU_SPLIT) { + // vram_weights += ggml_nbytes(model.output); + // } + } + const uint32_t n_ff = hparams.n_ff; + const int i_gpu_start = n_layer - n_gpu_layers; + model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend_split); + + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend_split); + + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend_split); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + } + } + } break; case LLM_ARCH_BAICHUAN: { model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); @@ -4928,6 +5040,103 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_opt() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + struct ggml_tensor * cur; + struct ggml_tensor * pos; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, 
model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + std::tie(k_cpy, v_cpy) = llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + } + // add input residual + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_RELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + // input for next layer + inpL = cur; + } + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; + } + struct ggml_cgraph * build_baichuan() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -6440,6 +6649,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_stablelm(); } break; + case LLM_ARCH_OPT: + { + result = llm.build_opt(); + } break; default: GGML_ASSERT(false); } From bc6e190bc1136c026da530eff784e726741e68ce Mon Sep 17 00:00:00 2001 From: "a.r.l" Date: Thu, 23 Jan 2025 21:16:20 +0800 Subject: [PATCH 3/4] feat: add sparse inference of opt --- convert-hf-to-powerinfer-gguf.py | 60 +++++++++++++++++++++++++ gguf-py/gguf/constants.py | 17 ++++++- gguf-py/gguf/tensor_mapping.py | 11 +++++ llama.cpp | 76 +++++++++++++++++++++++++++++--- 4 files changed, 158 insertions(+), 6 deletions(-) diff --git a/convert-hf-to-powerinfer-gguf.py b/convert-hf-to-powerinfer-gguf.py index 181fe972..28d77bdd 100644 --- a/convert-hf-to-powerinfer-gguf.py +++ b/convert-hf-to-powerinfer-gguf.py @@ -185,6 +185,8 @@ def from_model_architecture(model_architecture): return FalconModel if model_architecture == "LlamaForCausalLM": return LlamaModel + if model_architecture == "OPTForCausalLM": + return OptModel raise NotImplementedError(f'Architecture "{model_architecture}" not supported!') @@ -218,6 +220,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH: return gguf.MODEL_ARCH.FALCON if arch == "RWForCausalLM" or arch == "LlamaForCausalLM": return gguf.MODEL_ARCH.LLAMA + if arch == "OPTForCausalLM": + return gguf.MODEL_ARCH.OPT raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -513,7 +517,63 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) +class OptModel(Model): + def set_gguf_parameters(self, params: PredictorParams): + self.gguf_writer.add_name("opt") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + 
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_feed_forward_length(self.hparams["ffn_dim"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + # self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_file_type(self.ftype) + + if params.sparse_threshold is not None: + self.gguf_writer.add_sparse_threshold(params.sparse_threshold) + + def write_tensors(self): + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = self._translate_tensor_key(name) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + # We need to transpose the weight matrices for the FFN Down layers to support the + # Axpy operation in PowerInfer. So we don't need to transpose them at runtime. + if "ffn_down" in new_name: + new_name = new_name.replace("ffn_down", "ffn_down_t") + data = data.T + + n_dims = len(data.shape) + data_dtype = data.dtype + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) @dataclass class PredictorParams: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e82df27b..9459b477 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -90,6 +90,7 @@ class MODEL_ARCH(IntEnum): GPT2 = auto() GPTJ = auto() GPTNEOX = auto() + OPT = auto() MPT = auto() STARCODER = auto() PERSIMMON = auto() @@ -135,6 +136,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GPT2: "gpt2", MODEL_ARCH.GPTJ: "gptj", MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.OPT: "opt", MODEL_ARCH.MPT: "mpt", MODEL_ARCH.STARCODER: "starcoder", MODEL_ARCH.PERSIMMON: "persimmon", @@ -356,7 +358,20 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GPT2: [ # TODO ], - # TODO + MODEL_ARCH.OPT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], } # tensors that will not be serialized diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 2c813050..641b81f0 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -11,6 +11,7 @@ class TensorNameMap: MODEL_TENSOR.TOKEN_EMBD: ( "gpt_neox.embed_in", # gptneox "transformer.wte", # gpt2 gpt-j mpt refact + "decoder.embed_tokens", # opt "transformer.word_embeddings", # falcon "word_embeddings", # bloom "model.embed_tokens", # llama-hf @@ -33,6 +34,7 @@ class TensorNameMap: MODEL_TENSOR.POS_EMBD: ( "transformer.wpe", # gpt2 "embeddings.position_embeddings", # bert + "decoder.embed_positions", # opt ), # Output 
@@ -47,6 +49,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon + "decoder.final_layer_norm", # opt "model.norm", # llama-hf baichuan "norm", # llama-pth "embeddings.LayerNorm", # bert @@ -66,6 +69,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_NORM: ( "gpt_neox.layers.{bid}.input_layernorm", # gptneox "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact + "decoder.layers.{bid}.self_attn_layer_norm", # opt "transformer.blocks.{bid}.norm_1", # mpt "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom @@ -98,6 +102,7 @@ class TensorNameMap: "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j + "decoder.layers.{bid}.self_attn.q_proj", # opt ), # Attention key @@ -106,6 +111,7 @@ class TensorNameMap: "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j + "decoder.layers.{bid}.self_attn.k_proj", # opt ), # Attention value @@ -114,12 +120,14 @@ class TensorNameMap: "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j + "decoder.layers.{bid}.self_attn.v_proj", # opt ), # Attention output MODEL_TENSOR.ATTN_OUT: ( "gpt_neox.layers.{bid}.attention.dense", # gptneox "transformer.h.{bid}.attn.c_proj", # gpt2 refact + "decoder.layers.{bid}.self_attn.out_proj", # opt "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom @@ -140,6 +148,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_NORM: ( "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox "transformer.h.{bid}.ln_2", # gpt2 refact + "decoder.layers.{bid}.final_layer_norm", # opt "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt "model.layers.{bid}.post_attention_layernorm", # llama-hf @@ -153,6 +162,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_UP: ( "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox "transformer.h.{bid}.mlp.c_fc", # gpt2 + "decoder.layers.{bid}.fc1", # opt "transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "h.{bid}.mlp.dense_h_to_4h", # bloom @@ -173,6 +183,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox "transformer.h.{bid}.mlp.c_proj", # gpt2 refact + "decoder.layers.{bid}.fc2", # opt "transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "h.{bid}.mlp.dense_4h_to_h", # bloom diff --git a/llama.cpp b/llama.cpp index 2a9ea030..aad7f9c4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -247,7 +247,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_GPT2, "gpt2" }, { LLM_ARCH_GPTJ, "gptj" }, { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_OPT, "opt" }, + { LLM_ARCH_OPT, "opt" }, { LLM_ARCH_MPT, "mpt" }, { LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_STARCODER, "starcoder" }, @@ -499,7 +499,10 @@ static std::map> LLM_TENSOR_NAMES = {LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"}, {LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"}, {LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"}, + {LLM_TENSOR_FFN_DOWN_T, "blk.%d.ffn_down_t"}, {LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"}, + { LLM_TENSOR_MLP_PRED_FC1, "blk.%d.fc1" }, + { LLM_TENSOR_MLP_PRED_FC2, "blk.%d.fc2" }, }, }, { @@ -3264,7 +3267,47 @@ static void llm_load_sparse_model_tensors( } break; case LLM_ARCH_OPT: { - // TODO: 
load sparse tensor model + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); + // output + { + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + // model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + } + + const uint32_t n_ff = hparams.n_ff; + model.layers.resize(n_layer); + + for (uint32_t &i = current_layer; i < n_layer; ++i) { + auto & layer = model.layers[i]; + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down_t = create_tensor(tn(LLM_TENSOR_FFN_DOWN_T, "weight", i), {n_embd, n_ff}); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_T, "bias", i), {n_embd}); + + layer.mlp_pre_w1 = create_tensor(tn(LLM_TENSOR_MLP_PRED_FC1, "weight", i), {n_embd, GGML_NE_WILDCARD}); + layer.mlp_pre_w2 = create_tensor(tn(LLM_TENSOR_MLP_PRED_FC2, "weight", i), {GGML_NE_WILDCARD, n_ff}); + + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + } } break; case LLM_ARCH_FALCON: { @@ -5110,14 +5153,37 @@ struct llm_build_context { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + if(llama_use_sparse_inference(&model)) { + llm_build_cb_short cbs = [&](ggml_tensor * cur, const char * name) { + std::string name_str = std::string(name) + "-" + std::to_string(il); + ggml_set_name(cur, name_str.c_str()); + }; + // We only offload the ffn input to GPU if all neurons are offloaded + if (model.layers[il].gpu_offload_ratio >= 1.) 
{ + cb(cur, "ffn_norm", il); + } else { + cbs(cur, "ffn_norm"); + } + cur = llm_build_ffn_sparse(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down_t, model.layers[il].ffn_down_b, + model.layers[il].mlp_pre_w1, + model.layers[il].mlp_pre_w2, + ffn_inp, + model.layers[il].gpu_idx, + model.layers[il].gpu_bucket, model.layers[il].ffn_gate_gpu, model.layers[il].ffn_down_gpu, model.layers[il].ffn_up_gpu, + LLM_FFN_RELU, LLM_FFN_SEQ, model.layers[il].gpu_offload_ratio, cbs); + } else { + cb(cur, "ffn_norm", il); + cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, LLM_FFN_RELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); + cb(cur, "ffn_out", il); + } } cur = ggml_add(ctx0, cur, ffn_inp); From 7066d20aa179dd15e9c5c18c4af13719f3e0f81c Mon Sep 17 00:00:00 2001 From: "a.r.l" Date: Tue, 18 Feb 2025 21:12:29 +0800 Subject: [PATCH 4/4] feat: fix sparse problems --- common/common-ggml.cpp | 253 ----------------------- common/common-ggml.h | 18 -- convert-hf-to-powerinfer-gguf.py | 2 +- llama.cpp | 3 + powerinfer-py/powerinfer/export_split.py | 5 +- 5 files changed, 8 insertions(+), 273 deletions(-) delete mode 100644 common/common-ggml.cpp delete mode 100644 common/common-ggml.h diff --git a/common/common-ggml.cpp b/common/common-ggml.cpp deleted file mode 100644 index 794607c6..00000000 --- a/common/common-ggml.cpp +++ /dev/null @@ -1,253 +0,0 @@ -#include "common-ggml.h" - -#include -#include - -static const std::map GGML_FTYPE_MAP = { - {"q4_0", GGML_FTYPE_MOSTLY_Q4_0}, - {"q4_1", GGML_FTYPE_MOSTLY_Q4_1}, - {"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, - {"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, - {"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, -}; - -void ggml_print_ftypes(FILE * fp) { - for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) { - fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second); - } -} - -enum ggml_ftype ggml_parse_ftype(const char * str) { - enum ggml_ftype ftype; - if (str[0] == 'q') { - const auto it = GGML_FTYPE_MAP.find(str); - if (it == GGML_FTYPE_MAP.end()) { - fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str); - return GGML_FTYPE_UNKNOWN; - } - ftype = it->second; - } else { - ftype = (enum ggml_ftype) atoi(str); - } - - return ftype; -} - -bool ggml_common_quantize_0( - std::ifstream & finp, - std::ofstream & fout, - const ggml_ftype ftype, - const std::vector & to_quant, - const std::vector & to_skip) { - - ggml_type qtype = GGML_TYPE_F32; - - switch (ftype) { - case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; - case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; - case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; - case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; - case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; - case GGML_FTYPE_UNKNOWN: - case GGML_FTYPE_ALL_F32: - case GGML_FTYPE_MOSTLY_F16: - case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: - case GGML_FTYPE_MOSTLY_Q2_K: - case GGML_FTYPE_MOSTLY_Q3_K: - case GGML_FTYPE_MOSTLY_Q4_K: - case GGML_FTYPE_MOSTLY_Q5_K: - case GGML_FTYPE_MOSTLY_Q6_K: - { - fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); - return false; - } - }; - - if (!ggml_is_quantized(qtype)) { - fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype)); - return false; - } - - size_t total_size_org = 0; - size_t total_size_new = 0; - - std::vector work; - - std::vector data_u8; - 
std::vector data_f16; - std::vector data_f32; - - std::vector hist_all(1 << 4, 0); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - finp.read(reinterpret_cast(&length), sizeof(length)); - finp.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (finp.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[4] = { 1, 1, 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - finp.read (&name[0], length); - - printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); - - bool quantize = false; - - // check if we should quantize this tensor - for (const auto & s : to_quant) { - if (std::regex_match(name, std::regex(s))) { - quantize = true; - break; - } - } - - // check if we should skip this tensor - for (const auto & s : to_skip) { - if (std::regex_match(name, std::regex(s))) { - quantize = false; - break; - } - } - - // quantize only 2D tensors - quantize &= (n_dims == 2); - - if (quantize) { - if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) { - fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); - return false; - } - - if (ttype == GGML_TYPE_F16) { - data_f16.resize(nelements); - finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); - data_f32.resize(nelements); - for (int i = 0; i < nelements; ++i) { - data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); - } - } else { - data_f32.resize(nelements); - finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); - } - - ttype = qtype; - } else { - // const int bpe = (ttype == 0) ? 
sizeof(float) : sizeof(uint16_t); - int bpe = -1; - if (ttype == 0 || ttype == 18) { - bpe = sizeof(float); - } - else { - bpe = sizeof(uint16_t); - } - - data_u8.resize(nelements*bpe); - finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); - } - - fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); - fout.write(reinterpret_cast(&length), sizeof(length)); - fout.write(reinterpret_cast(&ttype), sizeof(ttype)); - for (int i = 0; i < n_dims; ++i) { - fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); - } - fout.write(&name[0], length); - - if (quantize) { - work.resize(nelements); // for quantization - - size_t cur_size = 0; - std::vector hist_cur(1 << 4, 0); - - switch ((ggml_type) ttype) { - case GGML_TYPE_Q4_0: - { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: - { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_0: - { - cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_1: - { - cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q8_0: - { - cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_Q8_K: - case GGML_TYPE_COUNT: - { - fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); - return false; - } - } - - fout.write(reinterpret_cast(work.data()), cur_size); - total_size_new += cur_size; - - printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < (int) hist_cur.size(); ++i) { - hist_all[i] += hist_cur[i]; - } - - for (int i = 0; i < (int) hist_cur.size(); ++i) { - printf("%5.3f ", hist_cur[i] / (float)nelements); - } - printf("\n"); - } else { - printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); - fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); - total_size_new += data_u8.size(); - } - - total_size_org += nelements * sizeof(float); - } - - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); - - { - int64_t sum_all = 0; - for (int i = 0; i < (int) hist_all.size(); ++i) { - sum_all += hist_all[i]; - } - - printf("%s: hist: ", __func__); - for (int i = 0; i < (int) hist_all.size(); ++i) { - printf("%5.3f ", hist_all[i] / (float)sum_all); - } - printf("\n"); - } - - return true; -} diff --git a/common/common-ggml.h b/common/common-ggml.h deleted file mode 100644 index 29ba4ad5..00000000 --- a/common/common-ggml.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include "ggml.h" - -#include -#include -#include - -enum ggml_ftype ggml_parse_ftype(const char * str); - -void ggml_print_ftypes(FILE * fp = stderr); - -bool ggml_common_quantize_0( - std::ifstream & finp, - std::ofstream & fout, - const ggml_ftype ftype, - const std::vector & to_quant, - const std::vector & to_skip); \ No newline at end of file diff --git 
a/convert-hf-to-powerinfer-gguf.py b/convert-hf-to-powerinfer-gguf.py
index 28d77bdd..0aa4632e 100644
--- a/convert-hf-to-powerinfer-gguf.py
+++ b/convert-hf-to-powerinfer-gguf.py
@@ -520,7 +520,7 @@ def write_tensors(self):
 class OptModel(Model):
     def set_gguf_parameters(self, params: PredictorParams):
         self.gguf_writer.add_name("opt")
-        self.gguf_writer.add_context_length(2048)  # not in config.json
+        self.gguf_writer.add_context_length(2050)  # not in config.json
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
         self.gguf_writer.add_feed_forward_length(self.hparams["ffn_dim"])
diff --git a/llama.cpp b/llama.cpp
index aad7f9c4..ac52908a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6483,6 +6483,9 @@ static struct ggml_cgraph * llama_build_graph(
 
         for (int i = 0; i < n_tokens; ++i) {
             data[i] = batch.pos[i];
+            if(model.arch == LLM_ARCH_OPT) {
+                data[i] += 2;
+            }
         }
     }
 
diff --git a/powerinfer-py/powerinfer/export_split.py b/powerinfer-py/powerinfer/export_split.py
index 9a773b26..7f230d8c 100644
--- a/powerinfer-py/powerinfer/export_split.py
+++ b/powerinfer-py/powerinfer/export_split.py
@@ -1,11 +1,14 @@
 import argparse
 import pickle
-import gguf
+import sys
 from gguf.constants import GGMLQuantizationType
 from gguf.gguf_writer import GGUFWriter
 import torch
 from pathlib import Path
 import os
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
 import struct
 import numpy as np
 import re
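
Note on the position handling: the "data[i] += 2" hunk above and the bump of the converter's context length from 2048 to 2050 go together. OPT's learned positional embeddings carry an offset of 2, so position ids are shifted by 2 before indexing a table that holds max_position_embeddings + 2 rows (2048 + 2 = 2050 here), which is also why the gpt-2-sparse example writes n_past + i + 2 into its position tensor. A minimal NumPy sketch of that lookup, with illustrative sizes only (this is not the converter's or llama.cpp's code):

    import numpy as np

    # Illustrative sizes: OPT keeps max_position_embeddings + 2 rows of
    # learned positional embeddings (2048 + 2 = 2050 for the models here).
    n_ctx_train = 2050
    n_embd = 512
    wpe = np.zeros((n_ctx_train, n_embd), dtype=np.float32)

    def opt_position_embeddings(n_past, n_tokens):
        # absolute positions of the new tokens, plus the OPT-specific offset of 2
        # (the same shift the llama.cpp hunk applies to batch.pos)
        pos = np.arange(n_past, n_past + n_tokens) + 2
        return wpe[pos]  # shape [n_tokens, n_embd]

    # the first 4 prompt tokens read rows 2..5 of the table
    print(opt_position_embeddings(0, 4).shape)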
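
Note on the sparse FFN path: llm_build_ffn_sparse wires in the predictor tensors (mlp_pre_w1/mlp_pre_w2, exported from blk.N.fc1/fc2) and the transposed down projection ffn_down_t, so the down step can be accumulated axpy-style over only the neurons the predictor marks as active; that is why the converter transposes ffn_down at export time rather than at runtime. The sketch below is a rough NumPy rendering of that dataflow with assumed shapes and a made-up activation threshold; it is not the ggml kernels, just the idea:

    import numpy as np

    def sparse_ffn(x, w_up, b_up, w_down_t, b_down, pre_w1, pre_w2, threshold=0.0):
        # x: [n_embd], w_up: [n_ff, n_embd], w_down_t: [n_embd, n_ff]
        # pre_w1: [r, n_embd], pre_w2: [n_ff, r]  (low-rank activation predictor)

        # 1. predict which FFN neurons are likely to fire
        scores = pre_w2 @ np.maximum(pre_w1 @ x, 0.0)          # [n_ff]
        active = np.flatnonzero(scores > threshold)

        # 2. evaluate only the selected rows of the up projection (ReLU FFN, as in OPT)
        h = np.maximum(w_up[active] @ x + b_up[active], 0.0)   # [n_active]

        # 3. accumulate the output column by column (the axpy step); storing the
        #    down projection transposed keeps each neuron's column contiguous
        y = b_down.astype(np.float32).copy()
        for j, neuron in enumerate(active):
            y += w_down_t[:, neuron] * h[j]
        return y

Neurons the predictor skips contribute nothing to y, which is what makes the split between GPU-resident and CPU-resident neuron buckets in the patch worthwhile.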