From 9f01e7807ab30c31e15607cd0fe2fce07774523e Mon Sep 17 00:00:00 2001 From: "a.r.l" Date: Thu, 2 Jan 2025 19:22:53 +0800 Subject: [PATCH 1/4] feat: add opt model example --- common/common-ggml.cpp | 253 +++ common/common-ggml.h | 18 + examples/gpt-2-sparse/CMakeLists.txt | 15 + examples/gpt-2-sparse/README.md | 158 ++ .../gpt-2-sparse/convert-cerebras-to-ggml.py | 183 ++ examples/gpt-2-sparse/convert-ckpt-to-ggml.py | 159 ++ examples/gpt-2-sparse/convert-h5-to-ggml.py | 195 ++ examples/gpt-2-sparse/download-ggml-model.sh | 69 + examples/gpt-2-sparse/download-model.sh | 48 + examples/gpt-2-sparse/main-30b.cpp | 1593 +++++++++++++++++ examples/gpt-2-sparse/main.cpp_123 | 1592 ++++++++++++++++ examples/gpt-2-sparse/main.cpp_bak | 1546 ++++++++++++++++ examples/gpt-2-sparse/main13b.cpp | 1583 ++++++++++++++++ examples/gpt-2-sparse/main7b.cpp | 1567 ++++++++++++++++ examples/gpt-2-sparse/quantize.cpp | 184 ++ 15 files changed, 9163 insertions(+) create mode 100644 common/common-ggml.cpp create mode 100644 common/common-ggml.h create mode 100644 examples/gpt-2-sparse/CMakeLists.txt create mode 100644 examples/gpt-2-sparse/README.md create mode 100644 examples/gpt-2-sparse/convert-cerebras-to-ggml.py create mode 100644 examples/gpt-2-sparse/convert-ckpt-to-ggml.py create mode 100644 examples/gpt-2-sparse/convert-h5-to-ggml.py create mode 100755 examples/gpt-2-sparse/download-ggml-model.sh create mode 100755 examples/gpt-2-sparse/download-model.sh create mode 100644 examples/gpt-2-sparse/main-30b.cpp create mode 100644 examples/gpt-2-sparse/main.cpp_123 create mode 100644 examples/gpt-2-sparse/main.cpp_bak create mode 100644 examples/gpt-2-sparse/main13b.cpp create mode 100644 examples/gpt-2-sparse/main7b.cpp create mode 100644 examples/gpt-2-sparse/quantize.cpp diff --git a/common/common-ggml.cpp b/common/common-ggml.cpp new file mode 100644 index 00000000..794607c6 --- /dev/null +++ b/common/common-ggml.cpp @@ -0,0 +1,253 @@ +#include "common-ggml.h" + +#include +#include + +static const std::map GGML_FTYPE_MAP = { + {"q4_0", GGML_FTYPE_MOSTLY_Q4_0}, + {"q4_1", GGML_FTYPE_MOSTLY_Q4_1}, + {"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, + {"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, + {"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, +}; + +void ggml_print_ftypes(FILE * fp) { + for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) { + fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second); + } +} + +enum ggml_ftype ggml_parse_ftype(const char * str) { + enum ggml_ftype ftype; + if (str[0] == 'q') { + const auto it = GGML_FTYPE_MAP.find(str); + if (it == GGML_FTYPE_MAP.end()) { + fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str); + return GGML_FTYPE_UNKNOWN; + } + ftype = it->second; + } else { + ftype = (enum ggml_ftype) atoi(str); + } + + return ftype; +} + +bool ggml_common_quantize_0( + std::ifstream & finp, + std::ofstream & fout, + const ggml_ftype ftype, + const std::vector & to_quant, + const std::vector & to_skip) { + + ggml_type qtype = GGML_TYPE_F32; + + switch (ftype) { + case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_UNKNOWN: + case GGML_FTYPE_ALL_F32: + case GGML_FTYPE_MOSTLY_F16: + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: + case GGML_FTYPE_MOSTLY_Q2_K: + case GGML_FTYPE_MOSTLY_Q3_K: + case 
GGML_FTYPE_MOSTLY_Q4_K: + case GGML_FTYPE_MOSTLY_Q5_K: + case GGML_FTYPE_MOSTLY_Q6_K: + { + fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); + return false; + } + }; + + if (!ggml_is_quantized(qtype)) { + fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype)); + return false; + } + + size_t total_size_org = 0; + size_t total_size_new = 0; + + std::vector work; + + std::vector data_u8; + std::vector data_f16; + std::vector data_f32; + + std::vector hist_all(1 << 4, 0); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + finp.read(reinterpret_cast(&length), sizeof(length)); + finp.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (finp.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[4] = { 1, 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + finp.read (&name[0], length); + + printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); + + bool quantize = false; + + // check if we should quantize this tensor + for (const auto & s : to_quant) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // check if we should skip this tensor + for (const auto & s : to_skip) { + if (std::regex_match(name, std::regex(s))) { + quantize = false; + break; + } + } + + // quantize only 2D tensors + quantize &= (n_dims == 2); + + if (quantize) { + if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) { + fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); + return false; + } + + if (ttype == GGML_TYPE_F16) { + data_f16.resize(nelements); + finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); + data_f32.resize(nelements); + for (int i = 0; i < nelements; ++i) { + data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); + } + } else { + data_f32.resize(nelements); + finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); + } + + ttype = qtype; + } else { + // const int bpe = (ttype == 0) ? 
sizeof(float) : sizeof(uint16_t); + int bpe = -1; + if (ttype == 0 || ttype == 18) { + bpe = sizeof(float); + } + else { + bpe = sizeof(uint16_t); + } + + data_u8.resize(nelements*bpe); + finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); + } + + fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); + fout.write(reinterpret_cast(&length), sizeof(length)); + fout.write(reinterpret_cast(&ttype), sizeof(ttype)); + for (int i = 0; i < n_dims; ++i) { + fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); + } + fout.write(&name[0], length); + + if (quantize) { + work.resize(nelements); // for quantization + + size_t cur_size = 0; + std::vector hist_cur(1 << 4, 0); + + switch ((ggml_type) ttype) { + case GGML_TYPE_Q4_0: + { + cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: + { + cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_0: + { + cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_1: + { + cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q8_0: + { + cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_COUNT: + { + fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); + return false; + } + } + + fout.write(reinterpret_cast(work.data()), cur_size); + total_size_new += cur_size; + + printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); + for (int i = 0; i < (int) hist_cur.size(); ++i) { + hist_all[i] += hist_cur[i]; + } + + for (int i = 0; i < (int) hist_cur.size(); ++i) { + printf("%5.3f ", hist_cur[i] / (float)nelements); + } + printf("\n"); + } else { + printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); + fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); + total_size_new += data_u8.size(); + } + + total_size_org += nelements * sizeof(float); + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); + + { + int64_t sum_all = 0; + for (int i = 0; i < (int) hist_all.size(); ++i) { + sum_all += hist_all[i]; + } + + printf("%s: hist: ", __func__); + for (int i = 0; i < (int) hist_all.size(); ++i) { + printf("%5.3f ", hist_all[i] / (float)sum_all); + } + printf("\n"); + } + + return true; +} diff --git a/common/common-ggml.h b/common/common-ggml.h new file mode 100644 index 00000000..29ba4ad5 --- /dev/null +++ b/common/common-ggml.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ggml.h" + +#include +#include +#include + +enum ggml_ftype ggml_parse_ftype(const char * str); + +void ggml_print_ftypes(FILE * fp = stderr); + +bool ggml_common_quantize_0( + std::ifstream & finp, + std::ofstream & fout, + const ggml_ftype ftype, + const std::vector & to_quant, + const std::vector & to_skip); \ No newline at end of file diff --git 
a/examples/gpt-2-sparse/CMakeLists.txt b/examples/gpt-2-sparse/CMakeLists.txt new file mode 100644 index 00000000..a06b42dc --- /dev/null +++ b/examples/gpt-2-sparse/CMakeLists.txt @@ -0,0 +1,15 @@ +# +# gpt-2 + +set(TEST_TARGET gpt-2-sparse) +add_executable(${TEST_TARGET} main7b.cpp) +# target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) +target_link_libraries(${TEST_TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) + +# +# gpt-2-quantize + +set(TEST_TARGET gpt-2-quantize) +add_executable(${TEST_TARGET} quantize.cpp) +# target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common) diff --git a/examples/gpt-2-sparse/README.md b/examples/gpt-2-sparse/README.md new file mode 100644 index 00000000..509fabc5 --- /dev/null +++ b/examples/gpt-2-sparse/README.md @@ -0,0 +1,158 @@ +# gpt-2 + +This is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library. + +The program runs on the CPU - no video card is required. + +The [Cerebras-GPT](https://huggingface.co/cerebras) models are also supported. + +The example supports the following GPT-2 models: + +| Model | Description | Disk Size | +| --- | --- | --- | +| 117M | Small model | 240 MB | +| 345M | Medium model | 680 MB | +| 774M | Large model | 1.5 GB | +| 1558M | XL model | 3.0 GB | + +Sample performance on MacBook M1 Pro: + +| Model | Size | Time / Token | +| --- | --- | --- | +| GPT-2 | 117M | 5 ms | +| GPT-2 | 345M | 12 ms | +| GPT-2 | 774M | 23 ms | +| GPT-2 | 1558M | 42 ms | + +*TODO: add tables for Cerebras-GPT models* + +Sample output: + +``` +$ ./bin/gpt-2 -h +usage: ./bin/gpt-2 [options] + +options: + -h, --help show this help message and exit + -s SEED, --seed SEED RNG seed (default: -1) + -t N, --threads N number of threads to use during computation (default: 8) + -p PROMPT, --prompt PROMPT + prompt to start generation with (default: random) + -n N, --n_predict N number of tokens to predict (default: 200) + --top_k N top-k sampling (default: 40) + --top_p N top-p sampling (default: 0.9) + --temp N temperature (default: 1.0) + -b N, --batch_size N batch size for prompt processing (default: 8) + -m FNAME, --model FNAME + model path (default: models/gpt-2-117M/ggml-model.bin) + +$ ./bin/gpt-2 +gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin' +gpt2_model_load: n_vocab = 50257 +gpt2_model_load: n_ctx = 1024 +gpt2_model_load: n_embd = 768 +gpt2_model_load: n_head = 12 +gpt2_model_load: n_layer = 12 +gpt2_model_load: f16 = 1 +gpt2_model_load: ggml ctx size = 311.12 MB +gpt2_model_load: memory size = 72.00 MB, n_mem = 12288 +gpt2_model_load: model size = 239.08 MB +main: number of tokens in prompt = 1 + +So this is going to be the end of the line for us. + +If the Dolphins continue to do their business, it's possible that the team could make a bid to bring in new defensive coordinator Scott Linehan. + +Linehan's job is a little daunting, but he's a great coach and an excellent coach. I don't believe we're going to make the playoffs. + +We're going to have to work hard to keep our heads down and get ready to go.<|endoftext|> + +main: mem per token = 2048612 bytes +main: load time = 106.32 ms +main: sample time = 7.10 ms +main: predict time = 506.40 ms / 5.06 ms per token +main: total time = 629.84 ms +``` + +## Downloading and converting the original models (GPT-2) + +You can download the original model files using the [download-model.sh](download-model.sh) Bash script. 
The models are +in Tensorflow format, so in order to use them with ggml, you need to convert them to appropriate format. This is done +via the [convert-ckpt-to-ggml.py](convert-ckpt-to-ggml.py) python script. + +Here is the entire process for the GPT-2 117M model (download from official site + conversion): + +``` +cd ggml/build +../examples/gpt-2/download-model.sh 117M + +Downloading model 117M ... +models/gpt-2-117M/checkpoint 100%[=============================>] 77 --.-KB/s in 0s +models/gpt-2-117M/encoder.json 100%[=============================>] 1018K 1.20MB/s in 0.8s +models/gpt-2-117M/hparams.json 100%[=============================>] 90 --.-KB/s in 0s +models/gpt-2-117M/model.ckpt.data-00000-of-00001 100%[=============================>] 474.70M 1.21MB/s in 8m 39s +models/gpt-2-117M/model.ckpt.index 100%[=============================>] 5.09K --.-KB/s in 0s +models/gpt-2-117M/model.ckpt.meta 100%[=============================>] 460.11K 806KB/s in 0.6s +models/gpt-2-117M/vocab.bpe 100%[=============================>] 445.62K 799KB/s in 0.6s +Done! Model '117M' saved in 'models/gpt-2-117M/' + +Run the convert-ckpt-to-ggml.py script to convert the model to ggml format. + + python /Users/john/ggml/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-117M/ 1 + +``` + +This conversion requires that you have python and Tensorflow installed on your computer. Still, if you want to avoid +this, you can download the already converted ggml models as described below. + +## Downloading and converting the original models (Cerebras-GPT) + +Clone the respective repository from here: https://huggingface.co/cerebras + +Use the [convert-cerebras-to-ggml.py](convert-cerebras-to-ggml.py) script to convert the model to `ggml` format: + +``` +cd ggml/build +git clone https://huggingface.co/cerebras/Cerebras-GPT-111M models/ +python ../examples/gpt-2/convert-cerebras-to-ggml.py models/Cerebras-GPT-111M/ + +``` + +## Downloading the ggml model directly (GPT-2) + +For convenience, I will be hosting the converted ggml model files in order to make it easier to run the examples. This +way, you can directly download a single binary file and start using it. No python or Tensorflow is required. + +Here is how to get the 117M ggml model: + +``` +cd ggml/build +../examples/gpt-2/download-ggml-model.sh 117M + +Downloading ggml model 117M ... +models/gpt-2-117M/ggml-model.bin 100%[===============================>] 239.58M 8.52MB/s in 28s +Done! Model '117M' saved in 'models/gpt-2-117M/ggml-model.bin' +You can now use it like this: + + $ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example" + +``` + +At some point, I might decide to stop hosting these models. So in that case, simply revert to the manual process above. + +## Quantizing the models + +You can also try to quantize the `ggml` models via 4-bit integer quantization. +Keep in mind that for smaller models, this will render them completely useless. +You generally want to quantize larger models. 
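The last argument selects the target type: `ggml_parse_ftype()` in `common/common-ggml.cpp` (added by this patch) accepts either a name such as `q4_0`, `q4_1`, `q5_0`, `q5_1`, `q8_0`, or the corresponding numeric `ggml_ftype` value, and `ggml_common_quantize_0()` then re-encodes every matching 2D tensor while copying everything else through unchanged.

For reference, here is a minimal sketch of a quantizer built on those helpers. It is **not** the `quantize.cpp` shipped with this example; the tensor-name regex and the exact header handling are illustrative assumptions only.

```
#include "ggml.h"
#include "common-ggml.h"

#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

int main(int argc, char ** argv) {
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f16.bin model-quant.bin type\n", argv[0]);
        ggml_print_ftypes(stderr); // q4_0, q4_1, q5_0, q5_1, q8_0
        return 1;
    }

    const ggml_ftype ftype = ggml_parse_ftype(argv[3]); // accepts "q4_0" or "2"
    if (ftype == GGML_FTYPE_UNKNOWN) {
        return 1;
    }

    std::ifstream finp(argv[1], std::ios::binary);
    std::ofstream fout(argv[2], std::ios::binary);
    if (!finp || !fout) {
        fprintf(stderr, "failed to open input/output files\n");
        return 1;
    }

    // copy magic + hparams, patching the stored ftype
    // (layout written by the convert-*.py scripts in this example:
    //  magic, n_vocab, n_ctx, n_embd, n_head, n_layer, ftype)
    int32_t hdr[7];
    finp.read ((char *) hdr, sizeof(hdr));
    hdr[6] = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
    fout.write((char *) hdr, sizeof(hdr));

    // copy the vocabulary verbatim: a count followed by (length, bytes) pairs
    int32_t n_tokens = 0;
    finp.read ((char *) &n_tokens, sizeof(n_tokens));
    fout.write((char *) &n_tokens, sizeof(n_tokens));
    for (int32_t i = 0; i < n_tokens; ++i) {
        int32_t len = 0;
        finp.read ((char *) &len, sizeof(len));
        fout.write((char *) &len, sizeof(len));
        std::string word(len, 0);
        finp.read (&word[0], len);
        fout.write(word.data(), len);
    }

    // quantize the 2D weight matrices, keep everything else as-is
    // (placeholder pattern - the real tool uses its own tensor names)
    const std::vector<std::string> to_quant = { ".*\\.weight" };
    const std::vector<std::string> to_skip  = { };

    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, to_skip)) {
        fprintf(stderr, "%s: failed to quantize '%s'\n", argv[0], argv[1]);
        return 1;
    }
    return 0;
}
```

The commands below use the `gpt-2-quantize` tool that is actually built by this example: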
+ +``` +# quantize GPT-2 F16 to Q4_0 (faster but less precise) +./bin/gpt-2-quantize models/gpt-2-1558M/ggml-model-f16.bin models/gpt-2-1558M/ggml-model-q4_0.bin 2 +./bin/gpt-2 -m models/gpt-2-1558M/ggml-model-q4_0.bin -p "This is an example" + +# quantize Cerebras F16 to Q4_1 (slower but more precise) +./bin/gpt-2-quantize models/Cerebras-GPT-6.7B/ggml-model-f16.bin models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin 3 +./bin/gpt-2 -m models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin -p "This is an example" + +``` diff --git a/examples/gpt-2-sparse/convert-cerebras-to-ggml.py b/examples/gpt-2-sparse/convert-cerebras-to-ggml.py new file mode 100644 index 00000000..6057f81c --- /dev/null +++ b/examples/gpt-2-sparse/convert-cerebras-to-ggml.py @@ -0,0 +1,183 @@ +# Convert Cerebras models to ggml format +# +# ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/ +# + +import sys +import struct +import json +import torch +import numpy as np +import re + +from transformers import AutoModelForCausalLM + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +if len(sys.argv) < 2: + print("Usage: convert-cerebras-to-ggml.py dir-model [use-f32]\n") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model-f16.bin" + +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + encoder = json.load(f) + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# use 16-bit or 32-bit floats +use_f16 = True +if len(sys.argv) > 2: + use_f16 = False + fname_out = sys.argv[1] + "/ggml-model-f32.bin" + +model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) +#print (model) + +list_vars = model.state_dict() +#print (list_vars) + +print(hparams) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", hparams["n_positions"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +fout.write(struct.pack("i", use_f16)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(encoder))) + +for key in encoder: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + # rename headers to keep 
compatibility + if name == "transformer.ln_f.weight": + name = "model/ln_f/g" + elif name == "transformer.ln_f.bias": + name = "model/ln_f/b" + elif name == "transformer.wte.weight": + name = "model/wte" + elif name == "transformer.wpe.weight": + name = "model/wpe" + elif name == "lm_head.weight": + name = "model/lm_head" + elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/g" + elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/b" + elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/w" + elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/b" + elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/w" + elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/b" + elif re.match(r"transformer.h.\d+.ln_2.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/g" + elif re.match(r"transformer.h.\d+.ln_2.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/b" + elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/w" + elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/b" + elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/w" + elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/b" + else: + print("Unrecognized variable name. %s", name) + + # we don't need these + if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): + print(" Skipping variable: " + name) + continue + + n_dims = len(data.shape); + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0; + if use_f16: + if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype = 0 + + # for efficiency - transpose the projection matrices + # "model/h.*/attn/c_attn/w" + # "model/h.*/attn/c_proj/w" + # "model/h.*/mlp/c_fc/w" + # "model/h.*/mlp/c_proj/w" + if name[-14:] == "/attn/c_attn/w" or \ + name[-14:] == "/attn/c_proj/w" or \ + name[-11:] == "/mlp/c_fc/w" or \ + name[-13:] == "/mlp/c_proj/w": + print(" Transposing") + data = data.transpose() + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/examples/gpt-2-sparse/convert-ckpt-to-ggml.py b/examples/gpt-2-sparse/convert-ckpt-to-ggml.py new file mode 100644 index 00000000..9113141f --- /dev/null +++ b/examples/gpt-2-sparse/convert-ckpt-to-ggml.py @@ -0,0 +1,159 @@ +# Convert a model checkpoint to a ggml compatible file +# +# Load the model using TensorFlow. +# Iterate over all variables and write them to a binary file. 
+# +# For each variable, write the following: +# - Number of dimensions (int) +# - Name length (int) +# - Dimensions (int[n_dims]) +# - Name (char[name_length]) +# - Data (float[n_dims]) +# +# By default, the bigger matrices are converted to 16-bit floats. +# This can be disabled by adding the "use-f32" CLI argument. +# +# At the start of the ggml file we write the model parameters +# and vocabulary. +# + +import sys +import json +import struct +import numpy as np +import tensorflow as tf + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +# helper method to convert a numpy array to different float types +def convert_to_ftype(data, ftype): + # fp16 + if ftype == 1: + return data.astype(np.float16) + + assert False, "Invalid ftype: " + str(ftype) + +if len(sys.argv) < 3: + print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n") + print(" ftype == 0 -> float32") + print(" ftype == 1 -> float16") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model.bin" + +with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f: + encoder = json.load(f) + +with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if len(sys.argv) > 2: + ftype = int(sys.argv[2]) + if ftype < 0 or ftype > 1: + print("Invalid ftype: " + str(ftype)) + sys.exit(1) + fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + +list_vars = tf.train.list_variables(dir_model) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["n_vocab"])) +fout.write(struct.pack("i", hparams["n_ctx"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +fout.write(struct.pack("i", ftype)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(encoder))) + +for key in encoder: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name, shape in list_vars: + print("Processing variable: " + name + " with shape: ", shape) + + data = tf.train.load_variable(dir_model, name).squeeze() + n_dims = len(data.shape); + + # for efficiency - transpose the projection matrices + # "model/h.*/attn/c_attn/w" + # "model/h.*/attn/c_proj/w" + # "model/h.*/mlp/c_fc/w" + # "model/h.*/mlp/c_proj/w" + if name[-14:] == 
"/attn/c_attn/w" or \ + name[-14:] == "/attn/c_proj/w" or \ + name[-11:] == "/mlp/c_fc/w" or \ + name[-13:] == "/mlp/c_proj/w": + print(" Transposing") + data = data.transpose() + + dshape = data.shape + + ftype_cur = 0 + if ftype != 0: + # match name: + # "model/wte" + # "model/h.*/attn/c_attn/w" + # "model/h.*/attn/c_proj/w" + # "model/h.*/mlp/c_fc/w" + # "model/h.*/mlp/c_proj/w" + if name == "model/wte" or name[-2:] == "/w": + print(" Converting to " + ftype_str[ftype]) + data = convert_to_ftype(data, ftype) + ftype_cur = ftype + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + # header + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", dshape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/examples/gpt-2-sparse/convert-h5-to-ggml.py b/examples/gpt-2-sparse/convert-h5-to-ggml.py new file mode 100644 index 00000000..6a2b8654 --- /dev/null +++ b/examples/gpt-2-sparse/convert-h5-to-ggml.py @@ -0,0 +1,195 @@ +# Convert GPT-2 h5 transformer model to ggml format +# +# Load the model using GPT2Model. +# Iterate over all variables and write them to a binary file. +# +# For each variable, write the following: +# - Number of dimensions (int) +# - Name length (int) +# - Dimensions (int[n_dims]) +# - Name (char[name_length]) +# - Data (float[n_dims]) +# +# By default, the bigger matrices are converted to 16-bit floats. +# This can be disabled by adding the "use-f32" CLI argument. +# +# At the start of the ggml file we write the model parameters +# and vocabulary. +# + +import sys +import struct +import json +import numpy as np +import re + +from transformers import GPT2Model + +# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +if len(sys.argv) < 2: + print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") + sys.exit(1) + +# output in the same directory as the model +dir_model = sys.argv[1] +fname_out = sys.argv[1] + "/ggml-model.bin" + +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + encoder = json.load(f) + +with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: + encoder_added = json.load(f) + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +# use 16-bit or 32-bit floats +use_f16 = True +if len(sys.argv) > 2: + use_f16 = False + fname_out = sys.argv[1] + "/ggml-model-f32.bin" + +model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True) +#print (model) + +list_vars = model.state_dict() +#print (list_vars) + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", hparams["n_positions"])) +fout.write(struct.pack("i", hparams["n_embd"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +#fout.write(struct.pack("i", hparams["rotary_dim"])) +fout.write(struct.pack("i", use_f16)) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(encoder) + len(encoder_added))) + +for key in encoder: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for key in encoder_added: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + # we don't need these + if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): + print(" Skipping variable: " + name) + continue + + n_dims = len(data.shape); + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype = 0; + if use_f16: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype = 0 + + # for efficiency - transpose these matrices: + # "transformer.h.*.mlp.c_proj.weight + if name.endswith(".mlp.c_proj.weight"): + print(" Transposing") + data = data.transpose() + + # rename headers to keep compatibility + if name == "ln_f.weight": + name = "model/ln_f/g" + elif name == "ln_f.bias": + name = "model/ln_f/b" + elif name == "wte.weight": + name = "model/wte" + elif name == "wpe.weight": + name = "model/wpe" + elif re.match(r"h\.\d+\.ln_1\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/g" + elif re.match(r"h\.\d+\.ln_1\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_1/b" + elif re.match(r"h\.\d+\.attn\.c_attn\.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/w" + elif re.match(r"h\.\d+\.attn\.c_attn\.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_attn/b" + elif re.match(r"h\.\d+\.attn\.c_proj\.weight", name): + i = re.findall("\d+", name)[0] + 
name = f"model/h{i}/attn/c_proj/w" + elif re.match(r"h.\d+.attn.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/attn/c_proj/b" + elif re.match(r"h.\d+.ln_2.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/g" + elif re.match(r"h.\d+.ln_2.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/ln_2/b" + elif re.match(r"h.\d+.mlp.c_fc.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/w" + elif re.match(r"h.\d+.mlp.c_fc.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_fc/b" + elif re.match(r"h.\d+.mlp.c_proj.weight", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/w" + elif re.match(r"h.\d+.mlp.c_proj.bias", name): + i = re.findall("\d+", name)[0] + name = f"model/h{i}/mlp/c_proj/b" + else: + print("Unrecognized variable name. %s", name) + + str = name.encode('utf-8') + + fout.write(struct.pack("iii", n_dims, len(str), ftype)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str); + + # data + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/examples/gpt-2-sparse/download-ggml-model.sh b/examples/gpt-2-sparse/download-ggml-model.sh new file mode 100755 index 00000000..3aae015b --- /dev/null +++ b/examples/gpt-2-sparse/download-ggml-model.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# This script downloads GPT-2 model files that have already been converted to ggml format. +# This way you don't have to convert them yourself. +# +# If you want to download the original GPT-2 model files, use the "download-model.sh" script instead. + +#src="https://ggml.ggerganov.com" +#pfx="ggml-model-gpt-2" + +src="https://huggingface.co/ggerganov/ggml" +pfx="resolve/main/ggml-model-gpt-2" + +ggml_path=$(dirname $(realpath $0)) + +# GPT-2 models +models=( "117M" "345M" "774M" "1558M" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [ "$#" -ne 1 ]; then + printf "Usage: $0 \n" + list_models + + exit 1 +fi + +model=$1 + +if [[ ! " ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +# download ggml model + +printf "Downloading ggml model $model ...\n" + +mkdir -p models/gpt-2-$model + +if [ -x "$(command -v wget)" ]; then + wget --quiet --show-progress -O models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin +elif [ -x "$(command -v curl)" ]; then + curl -L --output models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin +else + printf "Either wget or curl is required to download models.\n" + exit 1 +fi + +if [ $? -ne 0 ]; then + printf "Failed to download ggml model $model \n" + printf "Please try again later or download the original GPT-2 model files and convert them yourself.\n" + exit 1 +fi + +printf "Done! 
Model '$model' saved in 'models/gpt-2-$model/ggml-model.bin'\n" +printf "You can now use it like this:\n\n" +printf " $ ./bin/gpt-2 -m models/gpt-2-$model/ggml-model.bin -p \"This is an example\"\n" +printf "\n" diff --git a/examples/gpt-2-sparse/download-model.sh b/examples/gpt-2-sparse/download-model.sh new file mode 100755 index 00000000..f0c62f4f --- /dev/null +++ b/examples/gpt-2-sparse/download-model.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +ggml_path=$(dirname $(realpath $0)) + +# GPT-2 models +models=( "117M" "345M" "774M" "1558M" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [ "$#" -ne 1 ]; then + printf "Usage: $0 \n" + list_models + + exit 1 +fi + +model=$1 + +if [[ ! " ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +# download model + +printf "Downloading model $model ...\n" + +mkdir -p models/gpt-2-$model + +for file in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do + wget --quiet --show-progress -O models/gpt-2-$model/$file https://openaipublic.blob.core.windows.net/gpt-2/models/$model/$file +done + +printf "Done! Model '$model' saved in 'models/gpt-2-$model/'\n\n" +printf "Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.\n" +printf "\n" +printf " python $ggml_path/convert-ckpt-to-ggml.py models/gpt-2-$model/\n" +printf "\n" diff --git a/examples/gpt-2-sparse/main-30b.cpp b/examples/gpt-2-sparse/main-30b.cpp new file mode 100644 index 00000000..73eeff25 --- /dev/null +++ b/examples/gpt-2-sparse/main-30b.cpp @@ -0,0 +1,1593 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-cuda.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif +typedef void (*offload_func_t)(struct ggml_tensor * tensor); +void opt_nop(struct ggml_tensor * tensor) { // don't offload by default + (void) tensor; +} +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + // struct ggml_tensor * c_attn_attn_w; + // struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_attn_q_w; + struct ggml_tensor * c_attn_attn_q_b; + + struct ggml_tensor * c_attn_attn_k_w; + struct ggml_tensor * c_attn_attn_k_b; + + struct ggml_tensor * c_attn_attn_v_w; + struct ggml_tensor * c_attn_attn_v_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; + + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; + // gpu heat + struct ggml_tensor * c_mlp_fc_w_gpu; + struct ggml_tensor * c_mlp_proj_w_t; + struct ggml_tensor * c_mlp_proj_w_gpu; + + //predictor + struct ggml_tensor * mlp_pre_w1_w; + struct ggml_tensor * mlp_pre_w2_w; +}; + +struct opt_file { + // use FILE * so we don't have to re-open the file to 
mmap + FILE * fp; + size_t size; + + opt_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error("opt_file fail\n"); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + ~opt_file() { + if (fp) { + std::fclose(fp); + } + } +}; +#define _POSIX_MAPPED_FILES +#include +#include + +struct opt_mmap { + void * addr; + size_t size; + + opt_mmap(const opt_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error("mmap failed\n"); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~opt_mmap() { + munmap(addr, size); + } +#else + static constexpr bool SUPPORTED = false; + + opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +struct gpt2_model { + gpt2_hparams hparams; + struct opt_file * file; + struct opt_mmap * mapping; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct ggml_context * ctx0 = nullptr; +// std::vector compute_buffer; +void *compute_buffer; + +bool endsWith(const std::string& str, const std::string& suffix) { + if (str.length() < suffix.length()) { + return false; + } + return str.substr(str.length() - suffix.length()) == suffix; +} + + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + model.file = new opt_file(fname.c_str(), "rb"); + printf("size %d\n", model.file->size); + model.mapping = new opt_mmap(model.file, 0, false); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + 
} + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + /* int32_t n_vocab = 0; */ + /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ + + /* if (n_vocab != model.hparams.n_vocab) { */ + /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ + /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ + /* return false; */ + /* } */ + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + printf("wtype %d\n", wtype); + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += 
n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + //need refactor + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); + //predictor + ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size = 0; + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*51200; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + int main_gpu = 0; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#else +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; + // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; + + // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); + // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // model.lm_head->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["output_norm.weight"] = &model.ln_f_g; + model.tensors["output_norm.bias"] = &model.ln_f_b; + + model.tensors["tok_embeddings.weight"] = &model.wte; + model.tensors["pos_embeddings.weight"] = &model.wpe; + model.tensors["output.weight"] = &model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + memset(&layer, 0, sizeof(gpt2_layer)); + + // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // // 
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + // // need refine + // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); + // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); + // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); + + // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); + // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); + + // if (i <= 10) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); + // } else if (i <= 12) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); + // } else if (i <= 18) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); + + // } else if (i <= 21) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); + // } else if (i <= 26) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); + // } else if (i <= 31) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); + // } + + // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; + + // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; + // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_w_gpu->backend = 
OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; + model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; + + model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; + model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; + model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; + model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; + model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; + model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; + + model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; + model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; + model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; + } + } + + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + #ifdef GGML_USE_CUBLAS + // ggml_cuda_assign_buffers_no_scratch(model.memory_k); + // ggml_cuda_assign_buffers_no_scratch(model.memory_v); + #endif + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + ggml_set_no_alloc(ctx, true); + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + const std::vector to_gpu = { + "output_norm.bias", + "output_norm.weight", + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + // ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + "layers.*output_norm.weight", + "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_gpu_lv = { + // ".*attention.wq.weight", + // ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + // "layers.*output_norm.weight", + // "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_lock = { + "tok_embeddings.weight", + "pos_embeddings.weight", + // "output_norm.bias", + ".*attention.wq.weight", + ".*attention.wq.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + // ".*attention.wo.bias", + ".*feed_forward.w1.weight", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight", + ".*feed_forward.w2.bias", + ".*gpu.weight", + ".*attention_norm.weight", + ".*attention_norm.bias", + ".*output_norm.weight", + ".*output_norm.bias", + ".*attention.*fc1.weight", + ".*attention.*fc1.bias", + ".*attention.*fc2.weight", + ".*attention.*fc2.bias", + // ".*w2.bias", + // ".*w1.bias", + "output.weight", + }; + + while (true) { + int32_t n_dims; + 
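            // each tensor record in the model file stores: n_dims, the name length and
            // the ttype, followed by the dimensions, the name bytes and the raw tensor
            // data (the data itself is not copied here, it is mapped below via mmap)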
int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + int64_t new_ne[2]; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + new_ne[i] = ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + ggml_tensor ** ptr = model.tensors[name]; + // printf("name %s ptr %p\n", name.c_str(), *ptr); + // int k; + // scanf("%d", &k); + *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); + + auto tensor = (ggml_tensor *)*model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + + // for debugging + if (1) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + std::streampos offset = fin.tellg(); + // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.seekg(ggml_nbytes(tensor), std::ios::cur); + tensor->data = model.mapping->addr + static_cast(offset); + // if ( endsWith(name.c_str(), "weight_transpose")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i+4096]); + // } + // } + // printf("\n"); + // if (endsWith(name.c_str(), "weight_h20")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i]); + + // } + // int k; + // scanf("%d", &k); + // } + + // // GPT-2 models share the WTE tensor as the LM head + // if (name == "model/wte" && has_lm_head == false) { + // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + // } + + // if (name == "model/lm_head") { + // has_lm_head = true; + // } + if (model_params.low_vram == false) { + for (const auto &s : to_gpu) + { + // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) + // { + // std::regex pattern(R"(\d+)"); + // std::smatch match; + // int layer_id = 0; + // if (std::regex_search(name, match, pattern)) + // { + // std::string digitStr = match.str(); + // int num = std::stoi(digitStr); + // layer_id = num; + // } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + // if (layer_id > model_params.n_gpu_layers) + // break; + // } + if (std::regex_search(name, std::regex(s))) + { + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + } else { + for (const auto &s : to_gpu_lv) + { + if 
(std::regex_search(name, std::regex(s))) + { + std::regex pattern(R"(\d+)"); + std::smatch match; + int layer_id = 0; + if (std::regex_search(name, match, pattern)) + { + std::string digitStr = match.str(); + int num = std::stoi(digitStr); + layer_id = num; + } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + if (layer_id > model_params.n_gpu_layers) + break; + // printf("name %s\n", name.c_str()); + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + + } + if (tensor->backend == GGML_BACKEND_GPU) { + #if defined(GGML_USE_CUBLAS) + ggml_cuda_transform_tensor(tensor->data, tensor); + #endif + } + for (const auto &s : to_lock) + { + if (std::regex_match(name, std::regex(s))) + { + if(!mlock(tensor->data, ggml_nbytes(tensor))) { + // printf("mlock %s\n", name.c_str()); + } + else { + printf("mlock failed %s\n", name.c_str()); + } + } + } + + total_size += ggml_nbytes(tensor); + } + ggml_set_no_alloc(ctx, false); + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + printf("load finish\n"); + // int k; + // scanf("%d", &k); + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + // static std::vector buf(buf_size); + static void * buf = ggml_cuda_host_malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i + 2; + } + } + offload_func_t offload_func = opt_nop; + offload_func_t offload_func_kq = opt_nop; + offload_func_t offload_func_v = opt_nop; + offload_func_t offload_func_nr = opt_nop; + offload_func_t offload_debug = opt_nop; +#ifdef GGML_USE_CUBLAS + offload_debug = ggml_cuda_assign_buffers_no_alloc; + // offload_func = ggml_cuda_assign_buffers_no_alloc; + // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + // offload_func_v = ggml_cuda_assign_buffers_no_alloc; + // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; +#endif + // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; + // int k; + // scanf("%d", &k); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } 
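+    // Positions above are written as (n_past + i + 2): OPT's learned positional embeddings are
+    // offset by 2, which is also why the wpe table is sized with n_ctx + 2 rows elsewhere in this file.
+    // KQ_scale holds 1.0f/sqrtf(n_embd/n_head) = 1/sqrt(d_head), the standard scaled-dot-product
+    // attention factor applied to K*Q further below.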
+ + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL_first"); + // offload_func(inpL); + + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + offload_func(cur); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g); + offload_func(cur); + ggml_set_name(cur, "ln_1_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_1_b); + ggml_set_name(cur, "ln_1_b"); + // offload_func(cur); + + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + + struct ggml_tensor *k_cpy = nullptr; + struct ggml_tensor *v_cpy = nullptr; + // self-attention + { + // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); + offload_func_kq(Qcur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); + offload_func_kq(Kcur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); + offload_func_kq(Kcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); + offload_func_v(Vcur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); + offload_func_v(Vcur); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + offload_func_v(Vcur); + + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); + // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); + + offload_func_v(v); + k_cpy = ggml_cpy(ctx0, Kcur, k); + offload_func_kq(k_cpy); + ggml_set_name(k_cpy, "k_cpy"); + v_cpy = ggml_cpy(ctx0, Vcur, v); + offload_func_v(v_cpy); + ggml_set_name(v_cpy, "v_cpy"); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); + offload_func_kq(Qcur); + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + offload_func_kq(Q); + + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + // struct ggml_tensor * K = + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 
0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, model.memory_k, + 128, n_past + N, n_head, + ggml_element_size(model.memory_k)*n_embd, + ggml_element_size(model.memory_k)*128, + ggml_element_size(model.memory_k)*n_embd*n_ctx*il); + K->src[1] = k_cpy; + offload_func_kq(K); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + offload_func_kq(KQ_scaled); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, 128, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*128, + n_ctx*ggml_element_size(model.memory_k)*n_embd*il); + V->src[1] = v_cpy; + offload_func_v(V); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merge_cont"); + offload_func_v(cur); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_set_name(cur, "attn_proj"); + offload_func(cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_set_name(cur, "attn_bias"); + offload_func(cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + ggml_set_name(cur, "after attn"); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + offload_func(cur); + ggml_set_name(cur, "norm_FFN"); + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g); + offload_func(cur); + ggml_set_name(cur, "norm_FFN_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_2_b); + // offload_func(cur); + // ggml_set_name(cur, "norm_FFN_w"); + // cur_c = ggml_dup(ctx0, cur); + } + // if (N == 1) + if (1) + { + idx = ggml_mul_mat(ctx0, + 
model.layers[il].mlp_pre_w1_w, + inpFF); + offload_func(idx); + ggml_set_name(idx, "mlp_pre_w1"); + idx = ggml_relu(ctx0, idx); + offload_func(idx); + ggml_set_name(idx, "relu_pre"); + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w2_w, + idx); + ggml_set_name(idx, "mlp_pre_w2"); + // offload_func(idx); + // idx = ggml_sigmoid(ctx0, idx); + // offload_func(idx); + // idx_g = idx; + // idx = ggml_dup(ctx0, idx_g); + // ggml_set_name(idx, "idx_cpu_dup"); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + if (N >= 80) + // if (0) + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + offload_debug(cur); + offload_func(cur); + ggml_set_name(cur, "up_ffn"); + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + offload_debug(cur); + offload_func(cur); + } + else + { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_fc_w, + // cur); + // offload_func(cur); + // cur = ggml_add(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b); + // offload_func(cur); + + + struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, + model.layers[il].c_mlp_fc_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "mlp_up_gpu"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_mul_mat_idx(ctx0, + model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_idx); + ggml_set_name(cur, "mlp_up_cpu"); + cur = ggml_add_idx(ctx0, + cur, + model.layers[il].c_mlp_fc_b, + idx); + ggml_set_name(tmp, "mlp_up_bias"); + offload_debug(tmp); + offload_func(tmp); + + cur = ggml_add(ctx0, cur, tmp); + ggml_set_name(cur, "mlp_up_mix"); + offload_func(cur); + + // cur = tmp; + + } + + + + // GELU activation + // [3072, N] + cur = ggml_relu(ctx0, cur); + // cur_c = cur; + // offload_func(cur); + cur_c = cur->backend==GGML_BACKEND_CPU? 
cur : ggml_dup(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + if (N >= 80) { + // if (0) { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur, + NULL, + NULL); + offload_debug(cur); + offload_func(cur); + ggml_set_name(cur, "down_ffn"); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + offload_func(cur); + offload_debug(cur); + } + else { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + // offload_func(cur); + + // cur = ggml_axpy(ctx0, + // model.layers[il].c_mlp_proj_w_t, + // cur, + // NULL, + // NULL); + // offload_func(cur); + + + // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, + // model.layers[il].c_mlp_proj_w_gpu, + // cur, + // model.layers[il].gpu_bucket, + // NULL); + struct ggml_tensor *tmp = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "axpy"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur_c, + idx, + model.layers[il].gpu_idx); + + cur = ggml_add(ctx0, cur, tmp); + offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); + offload_func(cur); + + // tmp = ggml_add(ctx0, + // tmp, + // model.layers[il].c_mlp_proj_b); + // offload_func(tmp); + // offload_debug(tmp); + + // cur = tmp; + } + + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + offload_func(inpL); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + offload_func_nr(inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_mul(ctx0, + inpL, + model.ln_f_g); + offload_func_nr(inpL); + inpL = ggml_add(ctx0, + inpL, + model.ln_f_b); + ggml_set_name(inpL, "before"); + offload_func_nr(inpL); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "last_layer"); +// offload_func(inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); + ggml_cuda_assign_scratch_offset(node, 
(char*)node->data - (char *) compute_buffer); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } +#endif + + + + // run the computation + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + static std::vector work_buffer; + work_buffer.resize(plan.work_size); + plan.work_data = work_buffer.data(); + ggml_graph_compute(gf, &plan); + + //if (n_past%100 == 0) { + // ggml_graph_print (gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, "hello world"); + } + printf("load finish\n"); + + // keep this buffer alive while evaluating the model + + struct ggml_allocr * allocr = NULL; + // allocate the compute buffer + { + allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + // compute_buffer.resize(mem_size); + compute_buffer = ggml_cuda_host_malloc(mem_size); + // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // 
submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + int cnt = 0; + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + cnt += 1; + + if (cnt > 0) + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + llama_sampling_params & sparams = params.sparams; + const int top_k = sparams.top_k; + const float top_p = sparams.top_p; + const float temp = sparams.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/gpt-2-sparse/main.cpp_123 b/examples/gpt-2-sparse/main.cpp_123 new file mode 100644 index 00000000..4deed1df --- /dev/null +++ b/examples/gpt-2-sparse/main.cpp_123 @@ -0,0 +1,1592 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-cuda.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif +typedef void (*offload_func_t)(struct ggml_tensor * tensor); +void opt_nop(struct ggml_tensor * tensor) { // don't offload by default + (void) tensor; +} +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + // struct ggml_tensor * c_attn_attn_w; + // struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_attn_q_w; + struct ggml_tensor * c_attn_attn_q_b; + + struct ggml_tensor * c_attn_attn_k_w; + struct ggml_tensor * c_attn_attn_k_b; + + struct ggml_tensor * c_attn_attn_v_w; + struct ggml_tensor * c_attn_attn_v_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * 
c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; + + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; + // gpu heat + struct ggml_tensor * c_mlp_fc_w_gpu; + struct ggml_tensor * c_mlp_proj_w_t; + struct ggml_tensor * c_mlp_proj_w_gpu; + + //predictor + struct ggml_tensor * mlp_pre_w1_w; + struct ggml_tensor * mlp_pre_w2_w; +}; + +struct opt_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + opt_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error("opt_file fail\n"); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + ~opt_file() { + if (fp) { + std::fclose(fp); + } + } +}; +#define _POSIX_MAPPED_FILES +#include +#include + +struct opt_mmap { + void * addr; + size_t size; + + opt_mmap(const opt_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error("mmap failed\n"); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~opt_mmap() { + munmap(addr, size); + } +#else + static constexpr bool SUPPORTED = false; + + opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +struct gpt2_model { + gpt2_hparams hparams; + struct opt_file * file; + struct opt_mmap * mapping; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct ggml_context * ctx0 = nullptr; +// std::vector compute_buffer; +void *compute_buffer; + +bool endsWith(const std::string& str, const std::string& suffix) { + if (str.length() < suffix.length()) { + return false; + } + return str.substr(str.length() - suffix.length()) 
== suffix; +} + + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + model.file = new opt_file(fname.c_str(), "rb"); + printf("size %d\n", model.file->size); + model.mapping = new opt_mmap(model.file, 0, false); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + /* int32_t n_vocab = 0; */ + /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ + + /* if (n_vocab != model.hparams.n_vocab) { */ + /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ + /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ + /* return false; */ + /* } */ + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + printf("wtype %d\n", wtype); + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += 
n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + //need refactor + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); + //predictor + ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size = 0; + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*51200; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + int main_gpu = 0; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#else +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; + // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; + + // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); + // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // model.lm_head->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["output_norm.weight"] = &model.ln_f_g; + model.tensors["output_norm.bias"] = &model.ln_f_b; + + 
model.tensors["tok_embeddings.weight"] = &model.wte; + model.tensors["pos_embeddings.weight"] = &model.wpe; + model.tensors["output.weight"] = &model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + memset(&layer, 0, sizeof(gpt2_layer)); + + // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + // // need refine + // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); + // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); + // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); + + // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); + // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); + + // if (i <= 10) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); + // } else if (i <= 12) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); + // } else if (i <= 18) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); + + // } else if (i <= 21) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); + // } else if (i <= 26) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); + // } else if (i <= 31) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); + // } + + // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_w->backend = 
OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; + + // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; + // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; + model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; + + model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; + model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; + model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; + model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; + model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; + model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; + + model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; + model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; + model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; + } + } + + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + #ifdef GGML_USE_CUBLAS + // ggml_cuda_assign_buffers_no_scratch(model.memory_k); + // ggml_cuda_assign_buffers_no_scratch(model.memory_v); + #endif + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + ggml_set_no_alloc(ctx, true); + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + const std::vector to_gpu = { + "output_norm.bias", + "output_norm.weight", + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + // ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + "layers.*output_norm.weight", + "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_gpu_lv = { + // ".*attention.wq.weight", + // ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + // "layers.*output_norm.weight", + // "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_lock = { + "tok_embeddings.weight", + "pos_embeddings.weight", + // "output_norm.bias", + ".*attention.wq.weight", + ".*attention.wq.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + // ".*attention.wo.bias", + ".*feed_forward.w1.weight", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight", + ".*feed_forward.w2.bias", + ".*gpu.weight", + ".*attention_norm.weight", + ".*attention_norm.bias", + ".*output_norm.weight", + ".*output_norm.bias", + ".*attention.*fc1.weight", + ".*attention.*fc1.bias", + ".*attention.*fc2.weight", + ".*attention.*fc2.bias", + // ".*w2.bias", + // ".*w1.bias", + "output.weight", + }; + + while (true) { + int32_t n_dims; + 
int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + int64_t new_ne[2]; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + new_ne[i] = ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + ggml_tensor ** ptr = model.tensors[name]; + // printf("name %s ptr %p\n", name.c_str(), *ptr); + // int k; + // scanf("%d", &k); + *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); + + auto tensor = (ggml_tensor *)*model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + + // for debugging + if (1) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + std::streampos offset = fin.tellg(); + // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.seekg(ggml_nbytes(tensor), std::ios::cur); + tensor->data = model.mapping->addr + static_cast(offset); + // if ( endsWith(name.c_str(), "weight_transpose")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i+4096]); + // } + // } + // printf("\n"); + // if (endsWith(name.c_str(), "weight_h20")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i]); + + // } + // int k; + // scanf("%d", &k); + // } + + // // GPT-2 models share the WTE tensor as the LM head + // if (name == "model/wte" && has_lm_head == false) { + // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + // } + + // if (name == "model/lm_head") { + // has_lm_head = true; + // } + if (model_params.low_vram == false) { + for (const auto &s : to_gpu) + { + // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) + // { + // std::regex pattern(R"(\d+)"); + // std::smatch match; + // int layer_id = 0; + // if (std::regex_search(name, match, pattern)) + // { + // std::string digitStr = match.str(); + // int num = std::stoi(digitStr); + // layer_id = num; + // } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + // if (layer_id > model_params.n_gpu_layers) + // break; + // } + if (std::regex_search(name, std::regex(s))) + { + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + } else { + for (const auto &s : to_gpu_lv) + { + if 
(std::regex_search(name, std::regex(s))) + { + std::regex pattern(R"(\d+)"); + std::smatch match; + int layer_id = 0; + if (std::regex_search(name, match, pattern)) + { + std::string digitStr = match.str(); + int num = std::stoi(digitStr); + layer_id = num; + } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + if (layer_id > model_params.n_gpu_layers) + break; + // printf("name %s\n", name.c_str()); + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + + } + if (tensor->backend == GGML_BACKEND_GPU) { + #if defined(GGML_USE_CUBLAS) + ggml_cuda_transform_tensor(tensor->data, tensor); + #endif + } + for (const auto &s : to_lock) + { + if (std::regex_match(name, std::regex(s))) + { + if(!mlock(tensor->data, ggml_nbytes(tensor))) { + // printf("mlock %s\n", name.c_str()); + } + else { + printf("mlock failed %s\n", name.c_str()); + } + } + } + + total_size += ggml_nbytes(tensor); + } + ggml_set_no_alloc(ctx, false); + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + printf("load finish\n"); + // int k; + // scanf("%d", &k); + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + // static std::vector buf(buf_size); + static void * buf = ggml_cuda_host_malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i + 2; + } + } + offload_func_t offload_func = opt_nop; + offload_func_t offload_func_kq = opt_nop; + offload_func_t offload_func_v = opt_nop; + offload_func_t offload_func_nr = opt_nop; + offload_func_t offload_debug = opt_nop; +#ifdef GGML_USE_CUBLAS + offload_debug = ggml_cuda_assign_buffers_no_alloc; + // offload_func = ggml_cuda_assign_buffers_no_alloc; + // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + // offload_func_v = ggml_cuda_assign_buffers_no_alloc; + // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; +#endif + // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; + // int k; + // scanf("%d", &k); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } 
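+    // embd, position and KQ_scale are allocated through the ggml_allocr; during the measure pass
+    // (ggml_allocr_is_measure) their data pointers are not valid yet, so the writes above are
+    // skipped and only the required compute-buffer size is recorded for the later real allocation.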
+ + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL_first"); + // offload_func(inpL); + + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + offload_func(cur); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g); + offload_func(cur); + ggml_set_name(cur, "ln_1_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_1_b); + ggml_set_name(cur, "ln_1_b"); + // offload_func(cur); + + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + + struct ggml_tensor *k_cpy = nullptr; + struct ggml_tensor *v_cpy = nullptr; + // self-attention + { + // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); + offload_func_kq(Qcur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); + offload_func_kq(Kcur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); + offload_func_kq(Kcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); + offload_func_v(Vcur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); + offload_func_v(Vcur); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + offload_func_v(Vcur); + + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); + // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); + + offload_func_v(v); + k_cpy = ggml_cpy(ctx0, Kcur, k); + offload_func_kq(k_cpy); + ggml_set_name(k_cpy, "k_cpy"); + v_cpy = ggml_cpy(ctx0, Vcur, v); + offload_func_v(v_cpy); + ggml_set_name(v_cpy, "v_cpy"); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); + offload_func_kq(Qcur); + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + offload_func_kq(Q); + + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + // struct ggml_tensor * K = + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 
0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, model.memory_k, + 128, n_past + N, n_head, + ggml_element_size(model.memory_k)*n_embd, + ggml_element_size(model.memory_k)*128, + ggml_element_size(model.memory_k)*n_embd*n_ctx*il); + K->src[1] = k_cpy; + offload_func_kq(K); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + offload_func_kq(KQ_scaled); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, 128, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*128, + n_ctx*ggml_element_size(model.memory_k)*n_embd*il); + V->src[1] = v_cpy; + offload_func_v(V); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merge_cont"); + offload_func_v(cur); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_set_name(cur, "attn_proj"); + offload_func(cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_set_name(cur, "attn_bias"); + offload_func(cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + ggml_set_name(cur, "after attn"); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + offload_func(cur); + ggml_set_name(cur, "norm_FFN"); + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g); + offload_func(cur); + ggml_set_name(cur, "norm_FFN_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_2_b); + // offload_func(cur); + // ggml_set_name(cur, "norm_FFN_w"); + // cur_c = ggml_dup(ctx0, cur); + } + // if (N == 1) + if (1) + { + idx = ggml_mul_mat(ctx0, + 
model.layers[il].mlp_pre_w1_w, + inpFF); + offload_func(idx); + ggml_set_name(idx, "mlp_pre_w1"); + idx = ggml_relu(ctx0, idx); + offload_func(idx); + ggml_set_name(idx, "relu_pre"); + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w2_w, + idx); + ggml_set_name(idx, "mlp_pre_w2"); + // offload_func(idx); + // idx = ggml_sigmoid(ctx0, idx); + // offload_func(idx); + // idx_g = idx; + // idx = ggml_dup(ctx0, idx_g); + // ggml_set_name(idx, "idx_cpu_dup"); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + if (N >= 80) + // if (0) + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + offload_debug(cur); + offload_func(cur); + ggml_set_name(cur, "up_ffn"); + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + offload_debug(cur); + offload_func(cur); + } + else + { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_fc_w, + // cur); + // offload_func(cur); + // cur = ggml_add(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b); + // offload_func(cur); + + + struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, + model.layers[il].c_mlp_fc_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "mlp_up_gpu"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_mul_mat_idx(ctx0, + model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_idx); + ggml_set_name(cur, "mlp_up_cpu"); + cur = ggml_add_idx(ctx0, + cur, + model.layers[il].c_mlp_fc_b, + idx); + ggml_set_name(tmp, "mlp_up_bias"); + offload_debug(tmp); + offload_func(tmp); + + cur = ggml_add(ctx0, cur, tmp); + ggml_set_name(cur, "mlp_up_mix"); + offload_func(cur); + + // cur = tmp; + + } + + + + // GELU activation + // [3072, N] + cur = ggml_relu(ctx0, cur); + // cur_c = cur; + // offload_func(cur); + cur_c = cur->backend==GGML_BACKEND_CPU? 
cur : ggml_dup(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + if (N >= 80) { + // if (0) { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur, + NULL, + NULL); + offload_debug(cur); + offload_func(cur); + ggml_set_name(cur, "down_ffn"); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + offload_func(cur); + offload_debug(cur); + } + else { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + // offload_func(cur); + + // cur = ggml_axpy(ctx0, + // model.layers[il].c_mlp_proj_w_t, + // cur, + // NULL, + // NULL); + // offload_func(cur); + + + // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, + // model.layers[il].c_mlp_proj_w_gpu, + // cur, + // model.layers[il].gpu_bucket, + // NULL); + struct ggml_tensor *tmp = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "axpy"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur_c, + idx, + model.layers[il].gpu_idx); + + cur = ggml_add(ctx0, cur, tmp); + offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); + offload_func(cur); + + // tmp = ggml_add(ctx0, + // tmp, + // model.layers[il].c_mlp_proj_b); + // offload_func(tmp); + // offload_debug(tmp); + + // cur = tmp; + } + + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + offload_func(inpL); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + offload_func_nr(inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_mul(ctx0, + inpL, + model.ln_f_g); + offload_func_nr(inpL); + inpL = ggml_add(ctx0, + inpL, + model.ln_f_b); + ggml_set_name(inpL, "before"); + offload_func_nr(inpL); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "last_layer"); +// offload_func(inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); + ggml_cuda_assign_scratch_offset(node, 
(char*)node->data - (char *) compute_buffer); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } +#endif + + + + // run the computation + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + static std::vector work_buffer; + work_buffer.resize(plan.work_size); + plan.work_data = work_buffer.data(); + ggml_graph_compute(gf, &plan); + + //if (n_past%100 == 0) { + // ggml_graph_print (gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, "hello world"); + } + printf("load finish\n"); + + // keep this buffer alive while evaluating the model + + struct ggml_allocr * allocr = NULL; + // allocate the compute buffer + { + allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + // compute_buffer.resize(mem_size); + compute_buffer = ggml_cuda_host_malloc(mem_size); + // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // submit the input 
prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + int cnt = 0; + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + cnt += 1; + + if (cnt > 0) + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/gpt-2-sparse/main.cpp_bak b/examples/gpt-2-sparse/main.cpp_bak new file mode 100644 index 00000000..e1e9d58e --- /dev/null +++ b/examples/gpt-2-sparse/main.cpp_bak @@ -0,0 +1,1546 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-cuda.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif +typedef void (*offload_func_t)(struct ggml_tensor * tensor); +void opt_nop(struct ggml_tensor * tensor) { // don't offload by default + (void) tensor; +} +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + // struct ggml_tensor * c_attn_attn_w; + // struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_attn_q_w; + struct ggml_tensor * c_attn_attn_q_b; + + struct ggml_tensor * c_attn_attn_k_w; + struct ggml_tensor * c_attn_attn_k_b; + + struct ggml_tensor * c_attn_attn_v_w; + struct ggml_tensor * c_attn_attn_v_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct 
ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; + + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; + // gpu heat + struct ggml_tensor * c_mlp_fc_w_gpu; + struct ggml_tensor * c_mlp_proj_w_t; + struct ggml_tensor * c_mlp_proj_w_gpu; + + //predictor + struct ggml_tensor * mlp_pre_w1_w; + struct ggml_tensor * mlp_pre_w2_w; +}; + +struct opt_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + opt_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error("opt_file fail\n"); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + ~opt_file() { + if (fp) { + std::fclose(fp); + } + } +}; +#define _POSIX_MAPPED_FILES +#include +#include + +struct opt_mmap { + void * addr; + size_t size; + + opt_mmap(const opt_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error("mmap failed\n"); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~opt_mmap() { + munmap(addr, size); + } +#else + static constexpr bool SUPPORTED = false; + + opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +struct gpt2_model { + gpt2_hparams hparams; + struct opt_file * file; + struct opt_mmap * mapping; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct ggml_context * ctx0 = nullptr; +// std::vector compute_buffer; +void *compute_buffer; + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + model.file = new 
opt_file(fname.c_str(), "rb"); + printf("size %d\n", model.file->size); + model.mapping = new opt_mmap(model.file, 0, false); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + /* int32_t n_vocab = 0; */ + /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ + + /* if (n_vocab != model.hparams.n_vocab) { */ + /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ + /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ + /* return false; */ + /* } */ + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + printf("wtype %d\n", wtype); + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size 
+= n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + //need refactor + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); + //predictor + ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size = 0; + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*51200; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + int main_gpu = 0; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#else +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; + // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; + + // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); + // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // model.lm_head->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["output_norm.weight"] = &model.ln_f_g; + model.tensors["output_norm.bias"] = &model.ln_f_b; + + model.tensors["tok_embeddings.weight"] = &model.wte; + model.tensors["pos_embeddings.weight"] = &model.wpe; + model.tensors["output.weight"] = &model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + memset(&layer, 0, sizeof(gpt2_layer)); + + // layer.ln_1_g = ggml_new_tensor_1d(ctx, 
GGML_TYPE_F32, n_embd); + // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + // // need refine + // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); + // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); + // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); + + // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); + // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); + + // if (i <= 10) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); + // } else if (i <= 12) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); + // } else if (i <= 18) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); + + // } else if (i <= 21) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); + // } else if (i <= 26) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); + // } else if (i <= 31) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); + // } + + // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; + // // 
layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; + + // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; + // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; + model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; + + model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; + model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; + model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; + model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; + model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; + model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; + + model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; + model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; + model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; + } + } + + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + #ifdef GGML_USE_CUBLAS + // ggml_cuda_assign_buffers_no_scratch(model.memory_k); + // ggml_cuda_assign_buffers_no_scratch(model.memory_v); + #endif + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + ggml_set_no_alloc(ctx, true); + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + const std::vector to_gpu = { + "output_norm.bias", + "output_norm.weight", + // ".*attention.wq.weight", + // ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + // ".*feed_forward.w1.weight$", + ".*feed_forward.w1.bias", + // ".*feed_forward.w2.weight_h20$", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight$", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + "layers.*output_norm.weight", + "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_gpu_lv = { + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + // "layers.*output_norm.weight", + // "layers.*output_norm.bias", + // ".*fc1.weight", + // ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_lock = { + "tok_embeddings.weight", + "pos_embeddings.weight", + // "output_norm.bias", + ".*attention.wq.weight", + ".*attention.wq.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + // ".*attention.wo.bias", + ".*feed_forward.w1.weight", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight", + ".*feed_forward.w2.bias", + ".*gpu.weight", + ".*attention_norm.weight", + ".*attention_norm.bias", + ".*output_norm.weight", + ".*output_norm.bias", + ".*attention.*fc1.weight", + ".*attention.*fc1.bias", + ".*attention.*fc2.weight", + ".*attention.*fc2.bias", + // ".*w2.bias", + // 
".*w1.bias", + "output.weight", + }; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + int64_t new_ne[2]; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + new_ne[i] = ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + ggml_tensor ** ptr = model.tensors[name]; + // printf("name %s ptr %p\n", name.c_str(), *ptr); + // int k; + // scanf("%d", &k); + *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); + + auto tensor = (ggml_tensor *)*model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + std::streampos offset = fin.tellg(); + // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.seekg(ggml_nbytes(tensor), std::ios::cur); + tensor->data = model.mapping->addr + static_cast(offset); + + // // GPT-2 models share the WTE tensor as the LM head + // if (name == "model/wte" && has_lm_head == false) { + // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + // } + + // if (name == "model/lm_head") { + // has_lm_head = true; + // } + if (model_params.low_vram == false) { + for (const auto &s : to_gpu) + { + if (std::regex_search(name, std::regex(s))) + { + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + } else { + for (const auto &s : to_gpu_lv) + { + if (std::regex_search(name, std::regex(s))) + { + std::regex pattern(R"(\d+)"); + std::smatch match; + int layer_id = 0; + if (std::regex_search(name, match, pattern)) + { + std::string digitStr = match.str(); + int num = std::stoi(digitStr); + layer_id = num; + } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + if (layer_id > model_params.n_gpu_layers) + break; + // printf("name %s\n", name.c_str()); + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + + } + if (tensor->backend == GGML_BACKEND_GPU) { + #if defined(GGML_USE_CUBLAS) + ggml_cuda_transform_tensor(tensor->data, tensor); + #endif + } + for (const auto &s : to_lock) + { + if (std::regex_match(name, std::regex(s))) + { + if(!mlock(tensor->data, ggml_nbytes(tensor))) { + // printf("mlock %s\n", name.c_str()); + } + else { + printf("mlock 
failed %s\n", name.c_str()); + } + } + } + + total_size += ggml_nbytes(tensor); + } + ggml_set_no_alloc(ctx, false); + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + // static std::vector buf(buf_size); + static void * buf = ggml_cuda_host_malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i + 2; + } + } + offload_func_t offload_func = opt_nop; + offload_func_t offload_func_kq = opt_nop; + offload_func_t offload_func_v = opt_nop; + offload_func_t offload_func_nr = opt_nop; + offload_func_t offload_debug = opt_nop; +#ifdef GGML_USE_CUBLAS + // offload_debug = ggml_cuda_assign_buffers_no_alloc; + // offload_func = ggml_cuda_assign_buffers_no_alloc; + // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + // offload_func_v = ggml_cuda_assign_buffers_no_alloc; + // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; +#endif + // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; + // int k; + // scanf("%d", &k); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL_first"); + // offload_func(inpL); + + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + offload_func(cur); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g); + offload_func(cur); + ggml_set_name(cur, "ln_1_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_1_b); + ggml_set_name(cur, "ln_1_b"); + // offload_func(cur); + + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + + struct ggml_tensor *k_cpy 
= nullptr; + struct ggml_tensor *v_cpy = nullptr; + // self-attention + { + // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); + offload_func_kq(Qcur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); + offload_func_kq(Kcur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); + offload_func_kq(Kcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); + offload_func_v(Vcur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); + offload_func_v(Vcur); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + offload_func_v(Vcur); + + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); + // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); + + offload_func_v(v); + k_cpy = ggml_cpy(ctx0, Kcur, k); + offload_func_kq(k_cpy); + ggml_set_name(k_cpy, "k_cpy"); + v_cpy = ggml_cpy(ctx0, Vcur, v); + offload_func_v(v_cpy); + ggml_set_name(v_cpy, "v_cpy"); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); + offload_func_kq(Qcur); + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + offload_func_kq(Q); + + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + // struct ggml_tensor * K = + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, model.memory_k, + 128, n_past + N, n_head, + ggml_element_size(model.memory_k)*n_embd, + ggml_element_size(model.memory_k)*128, + ggml_element_size(model.memory_k)*n_embd*n_ctx*il); + K->src[1] = k_cpy; + offload_func_kq(K); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // 
[n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + offload_func_kq(KQ_scaled); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, 128, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*128, + n_ctx*ggml_element_size(model.memory_k)*n_embd*il); + V->src[1] = v_cpy; + offload_func_v(V); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merge_cont"); + offload_func_v(cur); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_set_name(cur, "attn_proj"); + offload_func(cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_set_name(cur, "attn_bias"); + offload_func(cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + ggml_set_name(cur, "after attn"); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + offload_func(cur); + ggml_set_name(cur, "norm_FFN"); + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g); + offload_func(cur); + ggml_set_name(cur, "norm_FFN_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_2_b); + // offload_func(cur); + // ggml_set_name(cur, "norm_FFN_w"); + // cur_c = ggml_dup(ctx0, cur); + } + // if (N == 1) + if (1) + { + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w1_w, + inpFF); + offload_func(idx); + ggml_set_name(idx, "mlp_pre_w1"); + idx = ggml_relu(ctx0, idx); + offload_func(idx); + ggml_set_name(idx, "relu_pre"); + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w2_w, + idx); + ggml_set_name(idx, "mlp_pre_w2"); + // offload_func(idx); + // idx = ggml_sigmoid(ctx0, idx); + // offload_func(idx); + // idx_g = idx; + // idx = ggml_dup(ctx0, idx_g); + // ggml_set_name(idx, "idx_cpu_dup"); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + if (N != 1) + // if (0) + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + offload_func(cur); + ggml_set_name(cur, "up_ffn"); + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + offload_func(cur); + } 
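+                // [editor sketch, not part of the original patch] The else-branch below is the
+                // sparse decode path: the predictor output `idx` (from mlp_pre_w1_w/mlp_pre_w2_w
+                // above) estimates which FFN neurons fire, and the up-projection is then split
+                // between GPU-resident rows (c_mlp_fc_w_gpu, addressed via gpu_bucket) and the
+                // remaining CPU rows (c_mlp_fc_w, masked via gpu_idx). Conceptually, with the
+                // exact merge/bias handling differing slightly between the main*.cpp variants
+                // in this patch:
+                //
+                //     tmp = mul_mat_special(fc_w_gpu, cur, idx, gpu_bucket);   // GPU slice
+                //     cur = mul_mat_idx    (fc_w,     cur, idx, gpu_idx);      // CPU remainder
+                //     cur = cur + tmp, plus fc_b gathered by idx               // merge + bias
+                //
+                // ggml_mul_mat_special / ggml_mul_mat_idx / ggml_add_idx are not stock ggml
+                // operators; they are assumed to come from the sparse kernels added elsewhere
+                // in this patch series.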
+ else + { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_fc_w, + // cur); + // offload_func(cur); + // cur = ggml_add(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b); + // offload_func(cur); + + + struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, + model.layers[il].c_mlp_fc_w_gpu, + // model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "mlp_up_gpu"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_mul_mat_idx(ctx0, + model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_idx); + ggml_set_name(cur, "mlp_up_cpu"); + + // cur = ggml_add_idx(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b, + // idx); + // offload_func(cur); + tmp = ggml_add_idx(ctx0, + tmp, + model.layers[il].c_mlp_fc_b, + idx); + offload_debug(tmp); + + + // cur = ggml_add(ctx0, cur, tmp); + // ggml_set_name(cur, "mlp_up_mix"); + // offload_func(cur); + + // cur = tmp; + + } + + + + // GELU activation + // [3072, N] + cur = ggml_relu(ctx0, cur); + // cur_c = cur; + // offload_func(cur); + cur_c = cur->backend==GGML_BACKEND_CPU? cur : ggml_dup(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + // if (N != 1) { + if (0) { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + offload_func(cur); + ggml_set_name(cur, "down_ffn"); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + offload_func(cur); + } + else { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + // offload_func(cur); + + // cur = ggml_axpy(ctx0, + // model.layers[il].c_mlp_proj_w_t, + // cur, + // NULL, + // NULL); + // offload_func(cur); + + + //here + // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, + // model.layers[il].c_mlp_proj_w_gpu, + // cur, + // model.layers[il].gpu_bucket, + // NULL); + // ggml_set_name(tmp, "axpy"); + // offload_func(tmp); + // offload_debug(tmp); + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur_c, + // NULL, + // NULL); + idx, + model.layers[il].gpu_bucket); + // model.layers[il].gpu_idx); + // offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); + offload_func(cur); + + // tmp = ggml_add(ctx0, + // tmp, + // model.layers[il].c_mlp_proj_b); + // offload_func(tmp); + // offload_debug(tmp); + + // cur = ggml_add(ctx0, cur, tmp); + // offload_func(cur); + } + + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + offload_func(inpL); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + offload_func_nr(inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_mul(ctx0, + inpL, + model.ln_f_g); + offload_func_nr(inpL); + inpL = ggml_add(ctx0, + inpL, + model.ln_f_b); + ggml_set_name(inpL, "before"); + offload_func_nr(inpL); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "last_layer"); +// offload_func(inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - 
embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } +#endif + + + + // run the computation + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + static std::vector work_buffer; + work_buffer.resize(plan.work_size); + plan.work_data = work_buffer.data(); + ggml_graph_compute(gf, &plan); + + //if (n_past%100 == 0) { + // ggml_graph_print (gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, "hello world"); + } + printf("load finish\n"); + + // keep this buffer alive while evaluating the model + + struct ggml_allocr * allocr = NULL; + // allocate the compute buffer + { + allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + // 
compute_buffer.resize(mem_size); + compute_buffer = ggml_cuda_host_malloc(mem_size); + // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + int cnt = 0; + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + cnt += 1; + + if (cnt > 0) + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/gpt-2-sparse/main13b.cpp b/examples/gpt-2-sparse/main13b.cpp new file mode 100644 index 00000000..0681da3e --- /dev/null +++ b/examples/gpt-2-sparse/main13b.cpp @@ -0,0 +1,1583 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-cuda.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif +typedef void (*offload_func_t)(struct ggml_tensor * tensor); +void opt_nop(struct ggml_tensor * tensor) { // don't 
offload by default + (void) tensor; +} +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + // struct ggml_tensor * c_attn_attn_w; + // struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_attn_q_w; + struct ggml_tensor * c_attn_attn_q_b; + + struct ggml_tensor * c_attn_attn_k_w; + struct ggml_tensor * c_attn_attn_k_b; + + struct ggml_tensor * c_attn_attn_v_w; + struct ggml_tensor * c_attn_attn_v_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; + + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; + // gpu heat + struct ggml_tensor * c_mlp_fc_w_gpu; + struct ggml_tensor * c_mlp_proj_w_t; + struct ggml_tensor * c_mlp_proj_w_gpu; + + //predictor + struct ggml_tensor * mlp_pre_w1_w; + struct ggml_tensor * mlp_pre_w2_w; +}; + +struct opt_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + opt_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error("opt_file fail\n"); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + ~opt_file() { + if (fp) { + std::fclose(fp); + } + } +}; +#define _POSIX_MAPPED_FILES +#include +#include + +struct opt_mmap { + void * addr; + size_t size; + + opt_mmap(const opt_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error("mmap failed\n"); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~opt_mmap() { + munmap(addr, size); + } +#else + static constexpr bool SUPPORTED = false; + + opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw 
std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +struct gpt2_model { + gpt2_hparams hparams; + struct opt_file * file; + struct opt_mmap * mapping; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct ggml_context * ctx0 = nullptr; +// std::vector compute_buffer; +void *compute_buffer; + +bool endsWith(const std::string& str, const std::string& suffix) { + if (str.length() < suffix.length()) { + return false; + } + return str.substr(str.length() - suffix.length()) == suffix; +} + + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + model.file = new opt_file(fname.c_str(), "rb"); + printf("size %d\n", model.file->size); + model.mapping = new opt_mmap(model.file, 0, false); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + /* int32_t n_vocab = 0; */ + /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ + + /* if (n_vocab != model.hparams.n_vocab) { */ + /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ + /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ + /* return false; */ + /* } */ + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + 
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + printf("wtype %d\n", wtype); + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + //need refactor + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); + //predictor + ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size = 0; + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*51200; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + int main_gpu = 0; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#else +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + 
const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; + // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; + + // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); + // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // model.lm_head->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["output_norm.weight"] = &model.ln_f_g; + model.tensors["output_norm.bias"] = &model.ln_f_b; + + model.tensors["tok_embeddings.weight"] = &model.wte; + model.tensors["pos_embeddings.weight"] = &model.wpe; + model.tensors["output.weight"] = &model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + memset(&layer, 0, sizeof(gpt2_layer)); + + // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + // // need refine + // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); + // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); + // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); + + // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); + // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); + + // if (i <= 10) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); + // } else if (i <= 12) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); + // } else if (i <= 18) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); + + // } else if (i <= 21) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); + // } else if (i <= 26) { + // 
layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); + // } else if (i <= 31) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); + // } + + // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; + + // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; + // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; + model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; + + model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; + model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; + model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; + model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; + model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; + model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; + + model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; + model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; + + model.tensors["layers." 
+ std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; + model.tensors["layers." + std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; + } + } + + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + #ifdef GGML_USE_CUBLAS + // ggml_cuda_assign_buffers_no_scratch(model.memory_k); + // ggml_cuda_assign_buffers_no_scratch(model.memory_v); + #endif + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + ggml_set_no_alloc(ctx, true); + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + const std::vector to_gpu = { + "output_norm.bias", + "output_norm.weight", + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + // ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + "layers.*output_norm.weight", + "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_gpu_lv = { + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + // "layers.*output_norm.weight", + // "layers.*output_norm.bias", + // ".*fc1.weight", + // ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_lock = { + "tok_embeddings.weight", + "pos_embeddings.weight", + // "output_norm.bias", + ".*attention.wq.weight", + ".*attention.wq.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + // ".*attention.wo.bias", + ".*feed_forward.w1.weight", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight", + ".*feed_forward.w2.bias", + ".*gpu.weight", + ".*attention_norm.weight", + ".*attention_norm.bias", + ".*output_norm.weight", + ".*output_norm.bias", + 
".*attention.*fc1.weight", + ".*attention.*fc1.bias", + ".*attention.*fc2.weight", + ".*attention.*fc2.bias", + // ".*w2.bias", + // ".*w1.bias", + "output.weight", + }; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + int64_t new_ne[2]; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + new_ne[i] = ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + ggml_tensor ** ptr = model.tensors[name]; + // printf("name %s ptr %p\n", name.c_str(), *ptr); + // int k; + // scanf("%d", &k); + *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); + + auto tensor = (ggml_tensor *)*model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + std::streampos offset = fin.tellg(); + // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.seekg(ggml_nbytes(tensor), std::ios::cur); + tensor->data = model.mapping->addr + static_cast(offset); + // if ( endsWith(name.c_str(), "weight_transpose")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i+4096]); + // } + // } + // printf("\n"); + // if (endsWith(name.c_str(), "weight_h20")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i]); + + // } + // int k; + // scanf("%d", &k); + // } + + // // GPT-2 models share the WTE tensor as the LM head + // if (name == "model/wte" && has_lm_head == false) { + // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + // } + + // if (name == "model/lm_head") { + // has_lm_head = true; + // } + if (model_params.low_vram == false) { + for (const auto &s : to_gpu) + { + // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) + // { + // std::regex pattern(R"(\d+)"); + // std::smatch match; + // int layer_id = 0; + // if (std::regex_search(name, match, pattern)) + // { + // std::string digitStr = match.str(); + // int num = std::stoi(digitStr); + // layer_id = num; + // } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + // if (layer_id > 
model_params.n_gpu_layers) + // break; + // } + // printf("name %s\n", name.c_str()); + if (std::regex_search(name, std::regex(s))) + { + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + } else { + for (const auto &s : to_gpu_lv) + { + if (std::regex_search(name, std::regex(s))) + { + std::regex pattern(R"(\d+)"); + std::smatch match; + int layer_id = 0; + if (std::regex_search(name, match, pattern)) + { + std::string digitStr = match.str(); + int num = std::stoi(digitStr); + layer_id = num; + } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + if (layer_id > model_params.n_gpu_layers) + break; + // printf("name %s\n", name.c_str()); + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + + } + if (tensor->backend == GGML_BACKEND_GPU) { + #if defined(GGML_USE_CUBLAS) + ggml_cuda_transform_tensor(tensor->data, tensor); + #endif + } + for (const auto &s : to_lock) + { + if (std::regex_match(name, std::regex(s))) + { + if(!mlock(tensor->data, ggml_nbytes(tensor))) { + // printf("mlock %s\n", name.c_str()); + } + else { + printf("mlock failed %s\n", name.c_str()); + } + } + } + + total_size += ggml_nbytes(tensor); + } + ggml_set_no_alloc(ctx, false); + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + // static std::vector buf(buf_size); + static void * buf = ggml_cuda_host_malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i + 2; + } + } + offload_func_t offload_func = opt_nop; + offload_func_t offload_func_kq = opt_nop; + offload_func_t offload_func_v = opt_nop; + offload_func_t offload_func_nr = opt_nop; + offload_func_t offload_debug = opt_nop; +#ifdef GGML_USE_CUBLAS + offload_debug = ggml_cuda_assign_buffers_no_alloc; + // offload_func = ggml_cuda_assign_buffers_no_alloc; + // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + // offload_func_v = ggml_cuda_assign_buffers_no_alloc; + // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; +#endif + // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; + // int k; + // scanf("%d", &k); + + struct ggml_tensor * KQ_scale = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL_first"); + // offload_func(inpL); + + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + offload_func(cur); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g); + offload_func(cur); + ggml_set_name(cur, "ln_1_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_1_b); + ggml_set_name(cur, "ln_1_b"); + // offload_func(cur); + + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + + struct ggml_tensor *k_cpy = nullptr; + struct ggml_tensor *v_cpy = nullptr; + // self-attention + { + // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); + offload_func_kq(Qcur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); + offload_func_kq(Kcur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); + offload_func_kq(Kcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); + offload_func_v(Vcur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); + offload_func_v(Vcur); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + offload_func_v(Vcur); + + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); + // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); + + offload_func_v(v); + k_cpy = ggml_cpy(ctx0, Kcur, k); + offload_func_kq(k_cpy); + ggml_set_name(k_cpy, "k_cpy"); + v_cpy = ggml_cpy(ctx0, Vcur, v); + offload_func_v(v_cpy); + ggml_set_name(v_cpy, "v_cpy"); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); + offload_func_kq(Qcur); + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + offload_func_kq(Q); + + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + // struct ggml_tensor * K = + // ggml_permute(ctx0, 
+ // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, model.memory_k, + 128, n_past + N, n_head, + ggml_element_size(model.memory_k)*n_embd, + ggml_element_size(model.memory_k)*128, + ggml_element_size(model.memory_k)*n_embd*n_ctx*il); + K->src[1] = k_cpy; + offload_func_kq(K); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + offload_func_kq(KQ_scaled); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, 128, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*128, + n_ctx*ggml_element_size(model.memory_k)*n_embd*il); + V->src[1] = v_cpy; + offload_func_v(V); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merge_cont"); + offload_func_v(cur); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_set_name(cur, "attn_proj"); + offload_func(cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_set_name(cur, "attn_bias"); + offload_func(cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + ggml_set_name(cur, "after attn"); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + offload_func(cur); + ggml_set_name(cur, "norm_FFN"); + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g); + offload_func(cur); + ggml_set_name(cur, "norm_FFN_g"); + cur = ggml_add(ctx0, + cur, + 
model.layers[il].ln_2_b); + // offload_func(cur); + // ggml_set_name(cur, "norm_FFN_w"); + // cur_c = ggml_dup(ctx0, cur); + } + // if (N == 1) + if (1) + { + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w1_w, + cur); + offload_func(idx); + ggml_set_name(idx, "mlp_pre_w1"); + idx = ggml_relu(ctx0, idx); + offload_func(idx); + ggml_set_name(idx, "relu_pre"); + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w2_w, + idx); + ggml_set_name(idx, "mlp_pre_w2"); + // offload_func(idx); + // idx = ggml_sigmoid(ctx0, idx); + // offload_func(idx); + // idx_g = idx; + // idx = ggml_dup(ctx0, idx_g); + // ggml_set_name(idx, "idx_cpu_dup"); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + // if (N != 1) + if (0) + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + offload_func(cur); + ggml_set_name(cur, "up_ffn"); + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + offload_func(cur); + } + else + { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_fc_w, + // cur); + // offload_func(cur); + // cur = ggml_add(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b); + // offload_func(cur); + + + struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, + model.layers[il].c_mlp_fc_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "mlp_up_gpu"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_mul_mat_idx(ctx0, + model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_idx); + ggml_set_name(cur, "mlp_up_cpu"); + tmp = ggml_add_idx(ctx0, + tmp, + model.layers[il].c_mlp_fc_b, + idx); + ggml_set_name(tmp, "mlp_up_bias"); + offload_debug(tmp); + offload_func(tmp); + + cur = ggml_add(ctx0, cur, tmp); + ggml_set_name(cur, "mlp_up_mix"); + offload_func(cur); + + // cur = tmp; + + } + + + + // GELU activation + // [3072, N] + cur = ggml_relu(ctx0, cur); + // cur_c = cur; + // offload_func(cur); + cur_c = cur->backend==GGML_BACKEND_CPU? 
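+ // cur_c is the host-side handle on the ReLU activations used by the CPU half of
+ // the sparse down-projection below (ggml_axpy with c_mlp_proj_w_t / gpu_idx);
+ // `cur` itself keeps feeding the GPU half (c_mlp_proj_w_gpu / gpu_bucket).
+ // When the activations already live on the CPU they are used as-is; otherwise
+ // ggml_dup presumably yields a copy that stays on the CPU backend: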
cur : ggml_dup(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + // if (N != 1) { + if (0) { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + offload_func(cur); + ggml_set_name(cur, "down_ffn"); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + offload_func(cur); + } + else { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + // offload_func(cur); + + // cur = ggml_axpy(ctx0, + // model.layers[il].c_mlp_proj_w_t, + // cur, + // NULL, + // NULL); + // offload_func(cur); + + + // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, + // model.layers[il].c_mlp_proj_w_gpu, + // cur, + // model.layers[il].gpu_bucket, + // NULL); + struct ggml_tensor *tmp = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "axpy"); + offload_func(tmp); + offload_debug(tmp); + + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur_c, + idx, + model.layers[il].gpu_idx); + + cur = ggml_add(ctx0, cur, tmp); + offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); + offload_func(cur); + + // tmp = ggml_add(ctx0, + // tmp, + // model.layers[il].c_mlp_proj_b); + // offload_func(tmp); + // offload_debug(tmp); + + // cur = tmp; + } + + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + offload_func(inpL); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + offload_func_nr(inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_mul(ctx0, + inpL, + model.ln_f_g); + offload_func_nr(inpL); + inpL = ggml_add(ctx0, + inpL, + model.ln_f_b); + ggml_set_name(inpL, "before"); + offload_func_nr(inpL); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "last_layer"); +// offload_func(inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if 
(node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } +#endif + + + + // run the computation + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + static std::vector work_buffer; + work_buffer.resize(plan.work_size); + plan.work_data = work_buffer.data(); + ggml_graph_compute(gf, &plan); + + //if (n_past%100 == 0) { + // ggml_graph_print (gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, "hello world"); + } + printf("load finish\n"); + + // keep this buffer alive while evaluating the model + + struct ggml_allocr * allocr = NULL; + // allocate the compute buffer + { + allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + // compute_buffer.resize(mem_size); + compute_buffer = ggml_cuda_host_malloc(mem_size); + // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + 
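+ // Generation loop: the prompt is pushed through gpt2_eval in chunks of at most
+ // params.n_batch tokens; once it has been consumed, the logits of the last token
+ // are sampled with gpt_sample_top_k_top_p and the sampled id is fed back in as a
+ // single-token batch. The loop stops at the end-of-text token (50256) or after
+ // n_predict generated tokens.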
std::vector embd; + + int cnt = 0; + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + cnt += 1; + + if (cnt > 0) + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + llama_sampling_params & sparams = params.sparams; + const int top_k = sparams.top_k; + const float top_p = sparams.top_p; + const float temp = sparams.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/gpt-2-sparse/main7b.cpp b/examples/gpt-2-sparse/main7b.cpp new file mode 100644 index 00000000..a07a5472 --- /dev/null +++ b/examples/gpt-2-sparse/main7b.cpp @@ -0,0 +1,1567 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-cuda.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif +typedef void (*offload_func_t)(struct ggml_tensor * tensor); +void opt_nop(struct ggml_tensor * tensor) { // don't offload by default + (void) tensor; +} +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + // struct ggml_tensor * c_attn_attn_w; + // struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_attn_q_w; + struct ggml_tensor * c_attn_attn_q_b; + + struct ggml_tensor * c_attn_attn_k_w; + struct ggml_tensor * c_attn_attn_k_b; + + struct ggml_tensor * c_attn_attn_v_w; + struct ggml_tensor * c_attn_attn_v_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor 
* c_mlp_proj_b; + + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; + // gpu heat + struct ggml_tensor * c_mlp_fc_w_gpu; + struct ggml_tensor * c_mlp_proj_w_t; + struct ggml_tensor * c_mlp_proj_w_gpu; + + //predictor + struct ggml_tensor * mlp_pre_w1_w; + struct ggml_tensor * mlp_pre_w2_w; +}; + +struct opt_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + opt_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error("opt_file fail\n"); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + ~opt_file() { + if (fp) { + std::fclose(fp); + } + } +}; +#define _POSIX_MAPPED_FILES +#include +#include + +struct opt_mmap { + void * addr; + size_t size; + + opt_mmap(const opt_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error("mmap failed\n"); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~opt_mmap() { + munmap(addr, size); + } +#else + static constexpr bool SUPPORTED = false; + + opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +struct gpt2_model { + gpt2_hparams hparams; + struct opt_file * file; + struct opt_mmap * mapping; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx; + std::map tensors; +}; + +struct ggml_context * ctx0 = nullptr; +// std::vector compute_buffer; +void *compute_buffer; + +bool endsWith(const std::string& str, const std::string& suffix) { + if (str.length() < suffix.length()) { + return false; + } + return str.substr(str.length() - suffix.length()) == suffix; +} + + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params 
model_params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + model.file = new opt_file(fname.c_str(), "rb"); + printf("size %d\n", model.file->size); + model.mapping = new opt_mmap(model.file, 0, false); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + /* int32_t n_vocab = 0; */ + /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ + + /* if (n_vocab != model.hparams.n_vocab) { */ + /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ + /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ + /* return false; */ + /* } */ + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + printf("wtype %d\n", wtype); + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); 
// ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + //need refactor + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 + ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); + //predictor + ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w + ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size = 0; + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 12*n_layer)*51200; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + int main_gpu = 0; +#if defined(GGML_USE_CUBLAS) + fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#else +#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; + // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; + + // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); + // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // model.lm_head->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["output_norm.weight"] = &model.ln_f_g; + model.tensors["output_norm.bias"] = &model.ln_f_b; + + model.tensors["tok_embeddings.weight"] = &model.wte; + model.tensors["pos_embeddings.weight"] = &model.wpe; + model.tensors["output.weight"] = &model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = 
model.layers[i]; + memset(&layer, 0, sizeof(gpt2_layer)); + + // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + // // need refine + // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); + // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); + // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); + + // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); + // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); + + // if (i <= 10) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); + // } else if (i <= 12) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); + // } else if (i <= 18) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); + + // } else if (i <= 21) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); + // } else if (i <= 26) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); + // } else if (i <= 31) { + // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); + // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); + // } + + // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; + // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_b->backend = 
OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; + + // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; + // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; + // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; + // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; + + // map by name + model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; + model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; + + model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; + model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; + model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; + model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; + model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; + + model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; + model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; + + model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; + model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; + model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; + + model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; + model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; + } + } + + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + #ifdef GGML_USE_CUBLAS + ggml_cuda_assign_buffers_no_scratch(model.memory_k); + ggml_cuda_assign_buffers_no_scratch(model.memory_v); + #endif + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + ggml_set_no_alloc(ctx, true); + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + const std::vector to_gpu = { + "output_norm.bias", + "output_norm.weight", + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + // ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + "layers.*output_norm.weight", + "layers.*output_norm.bias", + ".*fc1.weight", + ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_gpu_lv = { + ".*attention.wq.weight", + ".*attention.wq.bias", + ".*attention.wk.weight", + ".*attention.wk.bias", + ".*attention.wv.weight", + ".*attention.wv.bias", + ".*attention.wo.weight", + ".*attention.wo.weight_transpose", + ".*attention.wo.bias", + ".*feed_forward.w1.weight_h20", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_h20$", + // ".*feed_forward.w2.weight_transpose", + /* ".*feed_forward.w2.weight$", */ + ".*feed_forward.w2.bias", + ".*gpu.bucket", + ".*attention_norm.weight", + ".*attention_norm.bias", + // "layers.*output_norm.weight", + // "layers.*output_norm.bias", + // ".*fc1.weight", + // ".*fc2.weight", + // ".*attention.*fc1.weight", + // ".*attention.*fc1.bias", + // ".*attention.*fc2.weight", + // ".*attention.*fc2.bias", + + // "output.weight", + + // "model/h.*/attn/c_proj/w", + // "model/h.*/mlp/c_fc/w", + // "model/h.*/mlp/c_proj/w", + }; + const std::vector to_lock = { + "tok_embeddings.weight", + "pos_embeddings.weight", + // "output_norm.bias", + ".*attention.wq.weight", + ".*attention.wq.bias", + // ".*attention.wo.weight", + // ".*attention.wo.weight_transpose", + // ".*attention.wo.bias", + ".*feed_forward.w1.weight", + ".*feed_forward.w1.bias", + ".*feed_forward.w2.weight_transpose", + // ".*feed_forward.w2.weight", + ".*feed_forward.w2.bias", + ".*gpu.weight", + ".*attention_norm.weight", + ".*attention_norm.bias", + ".*output_norm.weight", + ".*output_norm.bias", + ".*attention.*fc1.weight", + ".*attention.*fc1.bias", + ".*attention.*fc2.weight", + ".*attention.*fc2.bias", + // ".*w2.bias", + // ".*w1.bias", + "output.weight", + }; + + while (true) { + int32_t n_dims; + int32_t length; + 
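+            // Each tensor record in the file is: n_dims, name length and ttype (one int32 each),
+            // followed by the dimensions, the name bytes and the raw tensor data.
+            // Note that the data is not copied below: tensor->data is pointed directly into the
+            // read-only mmap of the model file, and the input stream just seeks past it.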
int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + int64_t new_ne[2]; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + new_ne[i] = ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + ggml_tensor ** ptr = model.tensors[name]; + // printf("name %s ptr %p\n", name.c_str(), *ptr); + // int k; + // scanf("%d", &k); + *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); + + auto tensor = (ggml_tensor *)*model.tensors[name]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + std::streampos offset = fin.tellg(); + // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.seekg(ggml_nbytes(tensor), std::ios::cur); + tensor->data = model.mapping->addr + static_cast(offset); + // if ( endsWith(name.c_str(), "weight_transpose")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i+4096]); + // } + // } + // printf("\n"); + // if (endsWith(name.c_str(), "weight_h20")) { + // short *d = (short *)tensor->data; + // for (int i = 0; i < 10; i++) { + // printf("%d ", d[i]); + + // } + // int k; + // scanf("%d", &k); + // } + + // // GPT-2 models share the WTE tensor as the LM head + // if (name == "model/wte" && has_lm_head == false) { + // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + // } + + // if (name == "model/lm_head") { + // has_lm_head = true; + // } + if (model_params.low_vram == false) { + for (const auto &s : to_gpu) + { + if (std::regex_search(name, std::regex(s))) + { + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + } else { + for (const auto &s : to_gpu_lv) + { + if (std::regex_search(name, std::regex(s))) + { + std::regex pattern(R"(\d+)"); + std::smatch match; + int layer_id = 0; + if (std::regex_search(name, match, pattern)) + { + std::string digitStr = match.str(); + int num = std::stoi(digitStr); + layer_id = num; + } + // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); + if (layer_id > model_params.n_gpu_layers) + break; + // printf("name %s\n", name.c_str()); + tensor->backend = GGML_BACKEND_GPU; + break; + } + } + + } + if (tensor->backend == GGML_BACKEND_GPU) { 
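+                // Tensors tagged for the GPU above are handed to the CUDA backend here;
+                // on builds without CUBLAS this block is a no-op and the data stays in the mmap.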
+ #if defined(GGML_USE_CUBLAS) + ggml_cuda_transform_tensor(tensor->data, tensor); + #endif + } + for (const auto &s : to_lock) + { + if (std::regex_match(name, std::regex(s))) + { + if(!mlock(tensor->data, ggml_nbytes(tensor))) { + // printf("mlock %s\n", name.c_str()); + } + else { + printf("mlock failed %s\n", name.c_str()); + } + } + } + + total_size += ggml_nbytes(tensor); + } + ggml_set_no_alloc(ctx, false); + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + // static std::vector buf(buf_size); + static void * buf = ggml_cuda_host_malloc(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, embd); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i + 2; + } + } + offload_func_t offload_func = opt_nop; + offload_func_t offload_func_kq = opt_nop; + offload_func_t offload_func_v = opt_nop; + offload_func_t offload_func_nr = opt_nop; + offload_func_t offload_debug = opt_nop; +#ifdef GGML_USE_CUBLAS + offload_debug = ggml_cuda_assign_buffers_no_alloc; + offload_func = ggml_cuda_assign_buffers_no_alloc; + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; +#endif + // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; + // int k; + // scanf("%d", &k); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL_first"); + // offload_func(inpL); + + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + offload_func(cur); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g); + offload_func(cur); + ggml_set_name(cur, "ln_1_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_1_b); + 
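+            // What follows is a standard GPT-style decoder block: masked self-attention over the
+            // cached K/V memory, a residual add, and then the feed-forward network, which is
+            // evaluated sparsely with the help of the per-layer predictor weights (mlp_pre_w1/w2).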
ggml_set_name(cur, "ln_1_b"); + // offload_func(cur); + + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + + struct ggml_tensor *k_cpy = nullptr; + struct ggml_tensor *v_cpy = nullptr; + // self-attention + { + // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); + offload_func_kq(Qcur); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); + offload_func_kq(Kcur); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); + offload_func_kq(Kcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); + offload_func_v(Vcur); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); + offload_func_v(Vcur); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + offload_func_v(Vcur); + + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); + // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); + + offload_func_v(v); + k_cpy = ggml_cpy(ctx0, Kcur, k); + offload_func_kq(k_cpy); + ggml_set_name(k_cpy, "k_cpy"); + v_cpy = ggml_cpy(ctx0, Vcur, v); + offload_func_v(v_cpy); + ggml_set_name(v_cpy, "v_cpy"); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); + offload_func_kq(Qcur); + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + offload_func_kq(Q); + + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + // struct ggml_tensor * K = + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, model.memory_k, + 128, n_past + N, n_head, + ggml_element_size(model.memory_k)*n_embd, + ggml_element_size(model.memory_k)*128, + ggml_element_size(model.memory_k)*n_embd*n_ctx*il); + K->src[1] = k_cpy; + offload_func_kq(K); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // 
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + KQ_scale); + offload_func_kq(KQ_scaled); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, 128, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*128, + n_ctx*ggml_element_size(model.memory_k)*n_embd*il); + V->src[1] = v_cpy; + offload_func_v(V); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merge_cont"); + offload_func_v(cur); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_set_name(cur, "attn_proj"); + offload_func(cur); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_set_name(cur, "attn_bias"); + offload_func(cur); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + ggml_set_name(cur, "after attn"); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + offload_func(cur); + ggml_set_name(cur, "norm_FFN"); + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g); + offload_func(cur); + ggml_set_name(cur, "norm_FFN_g"); + cur = ggml_add(ctx0, + cur, + model.layers[il].ln_2_b); + // offload_func(cur); + // ggml_set_name(cur, "norm_FFN_w"); + // cur_c = ggml_dup(ctx0, cur); + } + // if (N == 1) + if (1) + { + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w1_w, + cur); + offload_func(idx); + ggml_set_name(idx, "mlp_pre_w1"); + idx = ggml_relu(ctx0, idx); + offload_func(idx); + ggml_set_name(idx, "relu_pre"); + idx = ggml_mul_mat(ctx0, + model.layers[il].mlp_pre_w2_w, + idx); + ggml_set_name(idx, "mlp_pre_w2"); + // offload_func(idx); + // idx = ggml_sigmoid(ctx0, idx); + // offload_func(idx); + // idx_g = idx; + // idx = ggml_dup(ctx0, idx_g); + // ggml_set_name(idx, "idx_cpu_dup"); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) 
+ // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + if (N >= 80) + // if (0) + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + offload_func(cur); + ggml_set_name(cur, "up_ffn"); + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + offload_func(cur); + } + else + { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_fc_w, + // cur); + // offload_func(cur); + // cur = ggml_add(ctx0, + // cur, + // model.layers[il].c_mlp_fc_b); + // offload_func(cur); + + + struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, + model.layers[il].c_mlp_fc_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "mlp_up_gpu"); + offload_func(tmp); + offload_debug(tmp); + cur = ggml_mul_mat_idx(ctx0, + model.layers[il].c_mlp_fc_w, + cur, + idx, + model.layers[il].gpu_idx); + ggml_set_name(cur, "mlp_up_cpu"); + tmp = ggml_add_idx(ctx0, + tmp, + model.layers[il].c_mlp_fc_b, + idx); + ggml_set_name(tmp, "mlp_up_bias"); + offload_debug(tmp); + offload_func(tmp); + + cur = ggml_add(ctx0, cur, tmp); + ggml_set_name(cur, "mlp_up_mix"); + offload_func(cur); + + // cur = tmp; + + } + + + + // GELU activation + // [3072, N] + cur = ggml_relu(ctx0, cur); + // cur_c = cur; + // offload_func(cur); + cur_c = cur->backend==GGML_BACKEND_CPU? cur : ggml_dup(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + if (N >= 80) { + // if (0) { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + offload_func(cur); + ggml_set_name(cur, "down_ffn"); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + offload_func(cur); + } + else { + // cur = ggml_mul_mat(ctx0, + // model.layers[il].c_mlp_proj_w, + // cur); + // offload_func(cur); + + // cur = ggml_axpy(ctx0, + // model.layers[il].c_mlp_proj_w_t, + // cur, + // NULL, + // NULL); + // offload_func(cur); + + + // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, + // model.layers[il].c_mlp_proj_w_gpu, + // cur, + // model.layers[il].gpu_bucket, + // NULL); + struct ggml_tensor *tmp = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_gpu, + cur, + idx, + model.layers[il].gpu_bucket); + ggml_set_name(tmp, "axpy"); + offload_func(tmp); + offload_debug(tmp); + + cur = ggml_axpy(ctx0, + model.layers[il].c_mlp_proj_w_t, + cur_c, + idx, + model.layers[il].gpu_idx); + + cur = ggml_add(ctx0, cur, tmp); + offload_func(cur); + + cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); + offload_func(cur); + + // tmp = ggml_add(ctx0, + // tmp, + // model.layers[il].c_mlp_proj_b); + // offload_func(tmp); + // offload_debug(tmp); + + // cur = tmp; + } + + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + offload_func(inpL); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + offload_func_nr(inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_mul(ctx0, + inpL, + model.ln_f_g); + offload_func_nr(inpL); + inpL = ggml_add(ctx0, + inpL, + model.ln_f_b); + ggml_set_name(inpL, "before"); + offload_func_nr(inpL); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "last_layer"); +// offload_func(inpL); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate 
the transformer +// +// - model: the model +// - allocr: ggml_allocr to use to allocate the compute buffer +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + struct ggml_allocr * allocr, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); + } + } +#endif + + + + // run the computation + struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); + static std::vector work_buffer; + work_buffer.resize(plan.work_size); + plan.work_data = work_buffer.data(); + ggml_graph_compute(gf, &plan); + + //if (n_past%100 == 0) { + // ggml_graph_print (gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, "hello world"); + } + printf("load finish\n"); + + // keep this buffer alive while evaluating the model + + struct ggml_allocr * allocr = NULL; + // allocate the compute buffer + { + allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = 
gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + + // recreate the allocator with the required memory + ggml_allocr_free(allocr); + // compute_buffer.resize(mem_size); + compute_buffer = ggml_cuda_host_malloc(mem_size); + // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); + + fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + int cnt = 0; + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + cnt += 1; + + if (cnt > 0) + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + llama_sampling_params & sparams = params.sparams; + const int top_k = sparams.top_k; + const float top_p = sparams.top_p; + const float temp = sparams.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/gpt-2-sparse/quantize.cpp b/examples/gpt-2-sparse/quantize.cpp new file mode 100644 index 00000000..f81c04e8 --- /dev/null +++ b/examples/gpt-2-sparse/quantize.cpp @@ -0,0 +1,184 @@ +#include "ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include 
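+// Stand-alone quantization tool: reads an f16/f32 gpt-2 ggml model file and rewrites its
+// large 2-D weight matrices in one of the integer formats (q4_0, q4_1, q5_0, q5_1, q8_0)
+// selected on the command line; everything else is copied through unchanged.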
+#include +#include +#include +#include +#include +#include + +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; +}; + +// quantize a model +bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { + gpt_vocab vocab; + + printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + + auto finp = std::ifstream(fname_inp, std::ios::binary); + if (!finp) { + fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); + return false; + } + + auto fout = std::ofstream(fname_out, std::ios::binary); + if (!fout) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + finp.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); + return false; + } + + fout.write((char *) &magic, sizeof(magic)); + } + + gpt2_hparams hparams; + + // load hparams + { + finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); + + fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); + fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); + } + + // load vocab + { + int32_t n_vocab = 0; + finp.read ((char *) &n_vocab, sizeof(n_vocab)); + fout.write((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); + return false; + } + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + finp.read ((char *) &len, sizeof(len)); + fout.write((char *) &len, sizeof(len)); + + word.resize(len); + finp.read ((char *) word.data(), len); + fout.write((char *) word.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // regexes of tensor names to be quantized + const std::vector to_quant = { + "model/wte", + "model/lm_head", + "model/h.*/attn/c_attn/w", + "model/h.*/attn/c_proj/w", + "model/h.*/mlp/c_fc/w", + "model/h.*/mlp/c_proj/w", + }; + + if 
(!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); + return false; + } + + finp.close(); + fout.close(); + + return true; +} + +// usage: +// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type +// +int main(int argc, char ** argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); + ggml_print_ftypes(stderr); + return 1; + } + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + const std::string fname_inp = argv[1]; + const std::string fname_out = argv[2]; + + const ggml_ftype ftype = ggml_parse_ftype(argv[3]); + + const int64_t t_main_start_us = ggml_time_us(); + + int64_t t_quantize_us = 0; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { + fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); + return 1; + } + + t_quantize_us = ggml_time_us() - t_start_us; + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + return 0; +} From 98d035ebab161d4b3fedbdc98bbf1908e6621481 Mon Sep 17 00:00:00 2001 From: "a.r.l" Date: Thu, 16 Jan 2025 16:53:19 +0800 Subject: [PATCH 2/4] feat: add opt model into llama.cpp --- examples/gpt-2-sparse/CMakeLists.txt | 15 - examples/gpt-2-sparse/README.md | 158 -- .../gpt-2-sparse/convert-cerebras-to-ggml.py | 183 -- examples/gpt-2-sparse/convert-ckpt-to-ggml.py | 159 -- examples/gpt-2-sparse/convert-h5-to-ggml.py | 195 -- examples/gpt-2-sparse/download-ggml-model.sh | 69 - examples/gpt-2-sparse/download-model.sh | 48 - examples/gpt-2-sparse/main-30b.cpp | 1593 ----------------- examples/gpt-2-sparse/main.cpp_123 | 1592 ---------------- examples/gpt-2-sparse/main.cpp_bak | 1546 ---------------- examples/gpt-2-sparse/main13b.cpp | 1583 ---------------- examples/gpt-2-sparse/main7b.cpp | 1567 ---------------- examples/gpt-2-sparse/quantize.cpp | 184 -- llama.cpp | 213 +++ 14 files changed, 213 insertions(+), 8892 deletions(-) delete mode 100644 examples/gpt-2-sparse/CMakeLists.txt delete mode 100644 examples/gpt-2-sparse/README.md delete mode 100644 examples/gpt-2-sparse/convert-cerebras-to-ggml.py delete mode 100644 examples/gpt-2-sparse/convert-ckpt-to-ggml.py delete mode 100644 examples/gpt-2-sparse/convert-h5-to-ggml.py delete mode 100755 examples/gpt-2-sparse/download-ggml-model.sh delete mode 100755 examples/gpt-2-sparse/download-model.sh delete mode 100644 examples/gpt-2-sparse/main-30b.cpp delete mode 100644 examples/gpt-2-sparse/main.cpp_123 delete mode 100644 examples/gpt-2-sparse/main.cpp_bak delete mode 100644 examples/gpt-2-sparse/main13b.cpp delete mode 100644 examples/gpt-2-sparse/main7b.cpp delete mode 100644 examples/gpt-2-sparse/quantize.cpp diff --git a/examples/gpt-2-sparse/CMakeLists.txt b/examples/gpt-2-sparse/CMakeLists.txt deleted file mode 100644 index a06b42dc..00000000 --- a/examples/gpt-2-sparse/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# -# gpt-2 - -set(TEST_TARGET gpt-2-sparse) -add_executable(${TEST_TARGET} main7b.cpp) -# target_link_libraries(${TEST_TARGET} PRIVATE ggml common 
common-ggml) -target_link_libraries(${TEST_TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) - -# -# gpt-2-quantize - -set(TEST_TARGET gpt-2-quantize) -add_executable(${TEST_TARGET} quantize.cpp) -# target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common) diff --git a/examples/gpt-2-sparse/README.md b/examples/gpt-2-sparse/README.md deleted file mode 100644 index 509fabc5..00000000 --- a/examples/gpt-2-sparse/README.md +++ /dev/null @@ -1,158 +0,0 @@ -# gpt-2 - -This is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library. - -The program runs on the CPU - no video card is required. - -The [Cerebras-GPT](https://huggingface.co/cerebras) models are also supported. - -The example supports the following GPT-2 models: - -| Model | Description | Disk Size | -| --- | --- | --- | -| 117M | Small model | 240 MB | -| 345M | Medium model | 680 MB | -| 774M | Large model | 1.5 GB | -| 1558M | XL model | 3.0 GB | - -Sample performance on MacBook M1 Pro: - -| Model | Size | Time / Token | -| --- | --- | --- | -| GPT-2 | 117M | 5 ms | -| GPT-2 | 345M | 12 ms | -| GPT-2 | 774M | 23 ms | -| GPT-2 | 1558M | 42 ms | - -*TODO: add tables for Cerebras-GPT models* - -Sample output: - -``` -$ ./bin/gpt-2 -h -usage: ./bin/gpt-2 [options] - -options: - -h, --help show this help message and exit - -s SEED, --seed SEED RNG seed (default: -1) - -t N, --threads N number of threads to use during computation (default: 8) - -p PROMPT, --prompt PROMPT - prompt to start generation with (default: random) - -n N, --n_predict N number of tokens to predict (default: 200) - --top_k N top-k sampling (default: 40) - --top_p N top-p sampling (default: 0.9) - --temp N temperature (default: 1.0) - -b N, --batch_size N batch size for prompt processing (default: 8) - -m FNAME, --model FNAME - model path (default: models/gpt-2-117M/ggml-model.bin) - -$ ./bin/gpt-2 -gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin' -gpt2_model_load: n_vocab = 50257 -gpt2_model_load: n_ctx = 1024 -gpt2_model_load: n_embd = 768 -gpt2_model_load: n_head = 12 -gpt2_model_load: n_layer = 12 -gpt2_model_load: f16 = 1 -gpt2_model_load: ggml ctx size = 311.12 MB -gpt2_model_load: memory size = 72.00 MB, n_mem = 12288 -gpt2_model_load: model size = 239.08 MB -main: number of tokens in prompt = 1 - -So this is going to be the end of the line for us. - -If the Dolphins continue to do their business, it's possible that the team could make a bid to bring in new defensive coordinator Scott Linehan. - -Linehan's job is a little daunting, but he's a great coach and an excellent coach. I don't believe we're going to make the playoffs. - -We're going to have to work hard to keep our heads down and get ready to go.<|endoftext|> - -main: mem per token = 2048612 bytes -main: load time = 106.32 ms -main: sample time = 7.10 ms -main: predict time = 506.40 ms / 5.06 ms per token -main: total time = 629.84 ms -``` - -## Downloading and converting the original models (GPT-2) - -You can download the original model files using the [download-model.sh](download-model.sh) Bash script. The models are -in Tensorflow format, so in order to use them with ggml, you need to convert them to appropriate format. This is done -via the [convert-ckpt-to-ggml.py](convert-ckpt-to-ggml.py) python script. 
- -Here is the entire process for the GPT-2 117M model (download from official site + conversion): - -``` -cd ggml/build -../examples/gpt-2/download-model.sh 117M - -Downloading model 117M ... -models/gpt-2-117M/checkpoint 100%[=============================>] 77 --.-KB/s in 0s -models/gpt-2-117M/encoder.json 100%[=============================>] 1018K 1.20MB/s in 0.8s -models/gpt-2-117M/hparams.json 100%[=============================>] 90 --.-KB/s in 0s -models/gpt-2-117M/model.ckpt.data-00000-of-00001 100%[=============================>] 474.70M 1.21MB/s in 8m 39s -models/gpt-2-117M/model.ckpt.index 100%[=============================>] 5.09K --.-KB/s in 0s -models/gpt-2-117M/model.ckpt.meta 100%[=============================>] 460.11K 806KB/s in 0.6s -models/gpt-2-117M/vocab.bpe 100%[=============================>] 445.62K 799KB/s in 0.6s -Done! Model '117M' saved in 'models/gpt-2-117M/' - -Run the convert-ckpt-to-ggml.py script to convert the model to ggml format. - - python /Users/john/ggml/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-117M/ 1 - -``` - -This conversion requires that you have python and Tensorflow installed on your computer. Still, if you want to avoid -this, you can download the already converted ggml models as described below. - -## Downloading and converting the original models (Cerebras-GPT) - -Clone the respective repository from here: https://huggingface.co/cerebras - -Use the [convert-cerebras-to-ggml.py](convert-cerebras-to-ggml.py) script to convert the model to `ggml` format: - -``` -cd ggml/build -git clone https://huggingface.co/cerebras/Cerebras-GPT-111M models/ -python ../examples/gpt-2/convert-cerebras-to-ggml.py models/Cerebras-GPT-111M/ - -``` - -## Downloading the ggml model directly (GPT-2) - -For convenience, I will be hosting the converted ggml model files in order to make it easier to run the examples. This -way, you can directly download a single binary file and start using it. No python or Tensorflow is required. - -Here is how to get the 117M ggml model: - -``` -cd ggml/build -../examples/gpt-2/download-ggml-model.sh 117M - -Downloading ggml model 117M ... -models/gpt-2-117M/ggml-model.bin 100%[===============================>] 239.58M 8.52MB/s in 28s -Done! Model '117M' saved in 'models/gpt-2-117M/ggml-model.bin' -You can now use it like this: - - $ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example" - -``` - -At some point, I might decide to stop hosting these models. So in that case, simply revert to the manual process above. - -## Quantizing the models - -You can also try to quantize the `ggml` models via 4-bit integer quantization. -Keep in mind that for smaller models, this will render them completely useless. -You generally want to quantize larger models. 
- -``` -# quantize GPT-2 F16 to Q4_0 (faster but less precise) -./bin/gpt-2-quantize models/gpt-2-1558M/ggml-model-f16.bin models/gpt-2-1558M/ggml-model-q4_0.bin 2 -./bin/gpt-2 -m models/gpt-2-1558M/ggml-model-q4_0.bin -p "This is an example" - -# quantize Cerebras F16 to Q4_1 (slower but more precise) -./bin/gpt-2-quantize models/Cerebras-GPT-6.7B/ggml-model-f16.bin models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin 3 -./bin/gpt-2 -m models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin -p "This is an example" - -``` diff --git a/examples/gpt-2-sparse/convert-cerebras-to-ggml.py b/examples/gpt-2-sparse/convert-cerebras-to-ggml.py deleted file mode 100644 index 6057f81c..00000000 --- a/examples/gpt-2-sparse/convert-cerebras-to-ggml.py +++ /dev/null @@ -1,183 +0,0 @@ -# Convert Cerebras models to ggml format -# -# ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/ -# - -import sys -import struct -import json -import torch -import numpy as np -import re - -from transformers import AutoModelForCausalLM - -# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - -if len(sys.argv) < 2: - print("Usage: convert-cerebras-to-ggml.py dir-model [use-f32]\n") - sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -fname_out = sys.argv[1] + "/ggml-model-f16.bin" - -with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - encoder = json.load(f) - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -# use 16-bit or 32-bit floats -use_f16 = True -if len(sys.argv) > 2: - use_f16 = False - fname_out = sys.argv[1] + "/ggml-model-f32.bin" - -model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True) -#print (model) - -list_vars = model.state_dict() -#print (list_vars) - -print(hparams) - -fout = open(fname_out, "wb") - -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex -fout.write(struct.pack("i", hparams["vocab_size"])) -fout.write(struct.pack("i", hparams["n_positions"])) -fout.write(struct.pack("i", hparams["n_embd"])) -fout.write(struct.pack("i", hparams["n_head"])) -fout.write(struct.pack("i", hparams["n_layer"])) -fout.write(struct.pack("i", use_f16)) - -byte_encoder = bytes_to_unicode() -byte_decoder = {v:k for k, v in byte_encoder.items()} - -fout.write(struct.pack("i", len(encoder))) - -for key in encoder: - text = bytearray([byte_decoder[c] for c in key]) - fout.write(struct.pack("i", len(text))) - fout.write(text) - -for name in list_vars.keys(): - data = list_vars[name].squeeze().numpy() - print("Processing variable: " + name + " with shape: ", data.shape) - - # rename headers to keep 
compatibility - if name == "transformer.ln_f.weight": - name = "model/ln_f/g" - elif name == "transformer.ln_f.bias": - name = "model/ln_f/b" - elif name == "transformer.wte.weight": - name = "model/wte" - elif name == "transformer.wpe.weight": - name = "model/wpe" - elif name == "lm_head.weight": - name = "model/lm_head" - elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_1/g" - elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_1/b" - elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_attn/w" - elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_attn/b" - elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_proj/w" - elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_proj/b" - elif re.match(r"transformer.h.\d+.ln_2.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_2/g" - elif re.match(r"transformer.h.\d+.ln_2.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_2/b" - elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_fc/w" - elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_fc/b" - elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_proj/w" - elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_proj/b" - else: - print("Unrecognized variable name. %s", name) - - # we don't need these - if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): - print(" Skipping variable: " + name) - continue - - n_dims = len(data.shape); - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype = 0; - if use_f16: - if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype = 0 - - # for efficiency - transpose the projection matrices - # "model/h.*/attn/c_attn/w" - # "model/h.*/attn/c_proj/w" - # "model/h.*/mlp/c_fc/w" - # "model/h.*/mlp/c_proj/w" - if name[-14:] == "/attn/c_attn/w" or \ - name[-14:] == "/attn/c_proj/w" or \ - name[-11:] == "/mlp/c_fc/w" or \ - name[-13:] == "/mlp/c_proj/w": - print(" Transposing") - data = data.transpose() - - # header - str = name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), ftype)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str); - - # data - data.tofile(fout) - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/gpt-2-sparse/convert-ckpt-to-ggml.py b/examples/gpt-2-sparse/convert-ckpt-to-ggml.py deleted file mode 100644 index 9113141f..00000000 --- a/examples/gpt-2-sparse/convert-ckpt-to-ggml.py +++ /dev/null @@ -1,159 +0,0 @@ -# Convert a model checkpoint to a ggml compatible file -# -# Load the model using TensorFlow. -# Iterate over all variables and write them to a binary file. 
-# -# For each variable, write the following: -# - Number of dimensions (int) -# - Name length (int) -# - Dimensions (int[n_dims]) -# - Name (char[name_length]) -# - Data (float[n_dims]) -# -# By default, the bigger matrices are converted to 16-bit floats. -# This can be disabled by adding the "use-f32" CLI argument. -# -# At the start of the ggml file we write the model parameters -# and vocabulary. -# - -import sys -import json -import struct -import numpy as np -import tensorflow as tf - -# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - -# helper method to convert a numpy array to different float types -def convert_to_ftype(data, ftype): - # fp16 - if ftype == 1: - return data.astype(np.float16) - - assert False, "Invalid ftype: " + str(ftype) - -if len(sys.argv) < 3: - print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n") - print(" ftype == 0 -> float32") - print(" ftype == 1 -> float16") - sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -fname_out = sys.argv[1] + "/ggml-model.bin" - -with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f: - encoder = json.load(f) - -with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -# possible data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 -# -# map from ftype to string -ftype_str = ["f32", "f16"] - -ftype = 1 -if len(sys.argv) > 2: - ftype = int(sys.argv[2]) - if ftype < 0 or ftype > 1: - print("Invalid ftype: " + str(ftype)) - sys.exit(1) - fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" - -list_vars = tf.train.list_variables(dir_model) - -fout = open(fname_out, "wb") - -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex -fout.write(struct.pack("i", hparams["n_vocab"])) -fout.write(struct.pack("i", hparams["n_ctx"])) -fout.write(struct.pack("i", hparams["n_embd"])) -fout.write(struct.pack("i", hparams["n_head"])) -fout.write(struct.pack("i", hparams["n_layer"])) -fout.write(struct.pack("i", ftype)) - -byte_encoder = bytes_to_unicode() -byte_decoder = {v:k for k, v in byte_encoder.items()} - -fout.write(struct.pack("i", len(encoder))) - -for key in encoder: - text = bytearray([byte_decoder[c] for c in key]) - fout.write(struct.pack("i", len(text))) - fout.write(text) - -for name, shape in list_vars: - print("Processing variable: " + name + " with shape: ", shape) - - data = tf.train.load_variable(dir_model, name).squeeze() - n_dims = len(data.shape); - - # for efficiency - transpose the projection matrices - # "model/h.*/attn/c_attn/w" - # "model/h.*/attn/c_proj/w" - # "model/h.*/mlp/c_fc/w" - # "model/h.*/mlp/c_proj/w" - if name[-14:] == 
"/attn/c_attn/w" or \ - name[-14:] == "/attn/c_proj/w" or \ - name[-11:] == "/mlp/c_fc/w" or \ - name[-13:] == "/mlp/c_proj/w": - print(" Transposing") - data = data.transpose() - - dshape = data.shape - - ftype_cur = 0 - if ftype != 0: - # match name: - # "model/wte" - # "model/h.*/attn/c_attn/w" - # "model/h.*/attn/c_proj/w" - # "model/h.*/mlp/c_fc/w" - # "model/h.*/mlp/c_proj/w" - if name == "model/wte" or name[-2:] == "/w": - print(" Converting to " + ftype_str[ftype]) - data = convert_to_ftype(data, ftype) - ftype_cur = ftype - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype_cur = 0 - - # header - str = name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) - for i in range(n_dims): - fout.write(struct.pack("i", dshape[n_dims - 1 - i])) - fout.write(str); - - # data - data.tofile(fout) - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/gpt-2-sparse/convert-h5-to-ggml.py b/examples/gpt-2-sparse/convert-h5-to-ggml.py deleted file mode 100644 index 6a2b8654..00000000 --- a/examples/gpt-2-sparse/convert-h5-to-ggml.py +++ /dev/null @@ -1,195 +0,0 @@ -# Convert GPT-2 h5 transformer model to ggml format -# -# Load the model using GPT2Model. -# Iterate over all variables and write them to a binary file. -# -# For each variable, write the following: -# - Number of dimensions (int) -# - Name length (int) -# - Dimensions (int[n_dims]) -# - Name (char[name_length]) -# - Data (float[n_dims]) -# -# By default, the bigger matrices are converted to 16-bit floats. -# This can be disabled by adding the "use-f32" CLI argument. -# -# At the start of the ggml file we write the model parameters -# and vocabulary. -# - -import sys -import struct -import json -import numpy as np -import re - -from transformers import GPT2Model - -# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. 
- """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - -if len(sys.argv) < 2: - print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n") - sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -fname_out = sys.argv[1] + "/ggml-model.bin" - -with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: - encoder = json.load(f) - -with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: - encoder_added = json.load(f) - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -# use 16-bit or 32-bit floats -use_f16 = True -if len(sys.argv) > 2: - use_f16 = False - fname_out = sys.argv[1] + "/ggml-model-f32.bin" - -model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True) -#print (model) - -list_vars = model.state_dict() -#print (list_vars) - -fout = open(fname_out, "wb") - -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex -fout.write(struct.pack("i", hparams["vocab_size"])) -fout.write(struct.pack("i", hparams["n_positions"])) -fout.write(struct.pack("i", hparams["n_embd"])) -fout.write(struct.pack("i", hparams["n_head"])) -fout.write(struct.pack("i", hparams["n_layer"])) -#fout.write(struct.pack("i", hparams["rotary_dim"])) -fout.write(struct.pack("i", use_f16)) - -byte_encoder = bytes_to_unicode() -byte_decoder = {v:k for k, v in byte_encoder.items()} - -fout.write(struct.pack("i", len(encoder) + len(encoder_added))) - -for key in encoder: - text = bytearray([byte_decoder[c] for c in key]) - fout.write(struct.pack("i", len(text))) - fout.write(text) - -for key in encoder_added: - text = bytearray([byte_decoder[c] for c in key]) - fout.write(struct.pack("i", len(text))) - fout.write(text) - -for name in list_vars.keys(): - data = list_vars[name].squeeze().numpy() - print("Processing variable: " + name + " with shape: ", data.shape) - - # we don't need these - if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"): - print(" Skipping variable: " + name) - continue - - n_dims = len(data.shape); - - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype = 0; - if use_f16: - if name[-7:] == ".weight" and n_dims == 2: - print(" Converting to float16") - data = data.astype(np.float16) - ftype = 1 - else: - print(" Converting to float32") - data = data.astype(np.float32) - ftype = 0 - - # for efficiency - transpose these matrices: - # "transformer.h.*.mlp.c_proj.weight - if name.endswith(".mlp.c_proj.weight"): - print(" Transposing") - data = data.transpose() - - # rename headers to keep compatibility - if name == "ln_f.weight": - name = "model/ln_f/g" - elif name == "ln_f.bias": - name = "model/ln_f/b" - elif name == "wte.weight": - name = "model/wte" - elif name == "wpe.weight": - name = "model/wpe" - elif re.match(r"h\.\d+\.ln_1\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_1/g" - elif re.match(r"h\.\d+\.ln_1\.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_1/b" - elif re.match(r"h\.\d+\.attn\.c_attn\.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_attn/w" - elif re.match(r"h\.\d+\.attn\.c_attn\.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_attn/b" - elif re.match(r"h\.\d+\.attn\.c_proj\.weight", name): - i = re.findall("\d+", name)[0] - 
name = f"model/h{i}/attn/c_proj/w" - elif re.match(r"h.\d+.attn.c_proj.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/attn/c_proj/b" - elif re.match(r"h.\d+.ln_2.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_2/g" - elif re.match(r"h.\d+.ln_2.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/ln_2/b" - elif re.match(r"h.\d+.mlp.c_fc.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_fc/w" - elif re.match(r"h.\d+.mlp.c_fc.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_fc/b" - elif re.match(r"h.\d+.mlp.c_proj.weight", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_proj/w" - elif re.match(r"h.\d+.mlp.c_proj.bias", name): - i = re.findall("\d+", name)[0] - name = f"model/h{i}/mlp/c_proj/b" - else: - print("Unrecognized variable name. %s", name) - - str = name.encode('utf-8') - - fout.write(struct.pack("iii", n_dims, len(str), ftype)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str); - - # data - data.tofile(fout) - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/gpt-2-sparse/download-ggml-model.sh b/examples/gpt-2-sparse/download-ggml-model.sh deleted file mode 100755 index 3aae015b..00000000 --- a/examples/gpt-2-sparse/download-ggml-model.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -# This script downloads GPT-2 model files that have already been converted to ggml format. -# This way you don't have to convert them yourself. -# -# If you want to download the original GPT-2 model files, use the "download-model.sh" script instead. - -#src="https://ggml.ggerganov.com" -#pfx="ggml-model-gpt-2" - -src="https://huggingface.co/ggerganov/ggml" -pfx="resolve/main/ggml-model-gpt-2" - -ggml_path=$(dirname $(realpath $0)) - -# GPT-2 models -models=( "117M" "345M" "774M" "1558M" ) - -# list available models -function list_models { - printf "\n" - printf " Available models:" - for model in "${models[@]}"; do - printf " $model" - done - printf "\n\n" -} - -if [ "$#" -ne 1 ]; then - printf "Usage: $0 \n" - list_models - - exit 1 -fi - -model=$1 - -if [[ ! " ${models[@]} " =~ " ${model} " ]]; then - printf "Invalid model: $model\n" - list_models - - exit 1 -fi - -# download ggml model - -printf "Downloading ggml model $model ...\n" - -mkdir -p models/gpt-2-$model - -if [ -x "$(command -v wget)" ]; then - wget --quiet --show-progress -O models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin -elif [ -x "$(command -v curl)" ]; then - curl -L --output models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin -else - printf "Either wget or curl is required to download models.\n" - exit 1 -fi - -if [ $? -ne 0 ]; then - printf "Failed to download ggml model $model \n" - printf "Please try again later or download the original GPT-2 model files and convert them yourself.\n" - exit 1 -fi - -printf "Done! 
Model '$model' saved in 'models/gpt-2-$model/ggml-model.bin'\n" -printf "You can now use it like this:\n\n" -printf " $ ./bin/gpt-2 -m models/gpt-2-$model/ggml-model.bin -p \"This is an example\"\n" -printf "\n" diff --git a/examples/gpt-2-sparse/download-model.sh b/examples/gpt-2-sparse/download-model.sh deleted file mode 100755 index f0c62f4f..00000000 --- a/examples/gpt-2-sparse/download-model.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -ggml_path=$(dirname $(realpath $0)) - -# GPT-2 models -models=( "117M" "345M" "774M" "1558M" ) - -# list available models -function list_models { - printf "\n" - printf " Available models:" - for model in "${models[@]}"; do - printf " $model" - done - printf "\n\n" -} - -if [ "$#" -ne 1 ]; then - printf "Usage: $0 \n" - list_models - - exit 1 -fi - -model=$1 - -if [[ ! " ${models[@]} " =~ " ${model} " ]]; then - printf "Invalid model: $model\n" - list_models - - exit 1 -fi - -# download model - -printf "Downloading model $model ...\n" - -mkdir -p models/gpt-2-$model - -for file in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do - wget --quiet --show-progress -O models/gpt-2-$model/$file https://openaipublic.blob.core.windows.net/gpt-2/models/$model/$file -done - -printf "Done! Model '$model' saved in 'models/gpt-2-$model/'\n\n" -printf "Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.\n" -printf "\n" -printf " python $ggml_path/convert-ckpt-to-ggml.py models/gpt-2-$model/\n" -printf "\n" diff --git a/examples/gpt-2-sparse/main-30b.cpp b/examples/gpt-2-sparse/main-30b.cpp deleted file mode 100644 index 73eeff25..00000000 --- a/examples/gpt-2-sparse/main-30b.cpp +++ /dev/null @@ -1,1593 +0,0 @@ -#include "ggml.h" -#include "ggml-alloc.h" -#include - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-cuda.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif -typedef void (*offload_func_t)(struct ggml_tensor * tensor); -void opt_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; -} -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - // struct ggml_tensor * c_attn_attn_w; - // struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_attn_q_w; - struct ggml_tensor * c_attn_attn_q_b; - - struct ggml_tensor * c_attn_attn_k_w; - struct ggml_tensor * c_attn_attn_k_b; - - struct ggml_tensor * c_attn_attn_v_w; - struct ggml_tensor * c_attn_attn_v_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; - - struct ggml_tensor * gpu_idx; - struct ggml_tensor * gpu_bucket; - // gpu heat - struct ggml_tensor * c_mlp_fc_w_gpu; - struct ggml_tensor * c_mlp_proj_w_t; - struct ggml_tensor * c_mlp_proj_w_gpu; - - //predictor - struct ggml_tensor * mlp_pre_w1_w; - struct ggml_tensor * mlp_pre_w2_w; -}; - -struct opt_file { - // use FILE * so we don't have to re-open the 
file to mmap - FILE * fp; - size_t size; - - opt_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error("opt_file fail\n"); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - ~opt_file() { - if (fp) { - std::fclose(fp); - } - } -}; -#define _POSIX_MAPPED_FILES -#include -#include - -struct opt_mmap { - void * addr; - size_t size; - - opt_mmap(const opt_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error("mmap failed\n"); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~opt_mmap() { - munmap(addr, size); - } -#else - static constexpr bool SUPPORTED = false; - - opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -struct gpt2_model { - gpt2_hparams hparams; - struct opt_file * file; - struct opt_mmap * mapping; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -struct ggml_context * ctx0 = nullptr; -// std::vector compute_buffer; -void *compute_buffer; - -bool endsWith(const std::string& str, const std::string& suffix) { - if (str.length() < suffix.length()) { - return false; - } - return str.substr(str.length() - suffix.length()) == suffix; -} - - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - model.file = new opt_file(fname.c_str(), "rb"); - printf("size %d\n", model.file->size); - model.mapping = new opt_mmap(model.file, 0, false); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return 
false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - /* int32_t n_vocab = 0; */ - /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ - - /* if (n_vocab != model.hparams.n_vocab) { */ - /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ - /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ - /* return false; */ - /* } */ - int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - printf("wtype %d\n", wtype); - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += 
n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - //need refactor - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); - //predictor - ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size = 0; - - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*51200; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - int main_gpu = 0; -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#else -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; - // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; - - // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); - // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // model.lm_head->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["output_norm.weight"] = &model.ln_f_g; - model.tensors["output_norm.bias"] = &model.ln_f_b; - - model.tensors["tok_embeddings.weight"] = &model.wte; - model.tensors["pos_embeddings.weight"] = &model.wpe; - model.tensors["output.weight"] = &model.lm_head; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - memset(&layer, 0, sizeof(gpt2_layer)); - - // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // // 
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - // // need refine - // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); - // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); - // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); - - // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); - // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); - - // if (i <= 10) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); - // } else if (i <= 12) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); - // } else if (i <= 18) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); - - // } else if (i <= 21) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); - // } else if (i <= 26) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); - // } else if (i <= 31) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); - // } - - // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; - - // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; - // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_w_gpu->backend = 
OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; - model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; - - model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; - model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; - model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; - model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; - model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; - model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; - - model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; - model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; - model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; - } - } - - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - #ifdef GGML_USE_CUBLAS - // ggml_cuda_assign_buffers_no_scratch(model.memory_k); - // ggml_cuda_assign_buffers_no_scratch(model.memory_v); - #endif - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - ggml_set_no_alloc(ctx, true); - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - const std::vector to_gpu = { - "output_norm.bias", - "output_norm.weight", - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - // ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - "layers.*output_norm.weight", - "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_gpu_lv = { - // ".*attention.wq.weight", - // ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - // "layers.*output_norm.weight", - // "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_lock = { - "tok_embeddings.weight", - "pos_embeddings.weight", - // "output_norm.bias", - ".*attention.wq.weight", - ".*attention.wq.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - // ".*attention.wo.bias", - ".*feed_forward.w1.weight", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight", - ".*feed_forward.w2.bias", - ".*gpu.weight", - ".*attention_norm.weight", - ".*attention_norm.bias", - ".*output_norm.weight", - ".*output_norm.bias", - ".*attention.*fc1.weight", - ".*attention.*fc1.bias", - ".*attention.*fc2.weight", - ".*attention.*fc2.bias", - // ".*w2.bias", - // ".*w1.bias", - "output.weight", - }; - - while (true) { - int32_t n_dims; - 
int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - int64_t new_ne[2]; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - new_ne[i] = ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - ggml_tensor ** ptr = model.tensors[name]; - // printf("name %s ptr %p\n", name.c_str(), *ptr); - // int k; - // scanf("%d", &k); - *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); - - auto tensor = (ggml_tensor *)*model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - - // for debugging - if (1) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - std::streampos offset = fin.tellg(); - // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - fin.seekg(ggml_nbytes(tensor), std::ios::cur); - tensor->data = model.mapping->addr + static_cast(offset); - // if ( endsWith(name.c_str(), "weight_transpose")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i+4096]); - // } - // } - // printf("\n"); - // if (endsWith(name.c_str(), "weight_h20")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i]); - - // } - // int k; - // scanf("%d", &k); - // } - - // // GPT-2 models share the WTE tensor as the LM head - // if (name == "model/wte" && has_lm_head == false) { - // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - // } - - // if (name == "model/lm_head") { - // has_lm_head = true; - // } - if (model_params.low_vram == false) { - for (const auto &s : to_gpu) - { - // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) - // { - // std::regex pattern(R"(\d+)"); - // std::smatch match; - // int layer_id = 0; - // if (std::regex_search(name, match, pattern)) - // { - // std::string digitStr = match.str(); - // int num = std::stoi(digitStr); - // layer_id = num; - // } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - // if (layer_id > model_params.n_gpu_layers) - // break; - // } - if (std::regex_search(name, std::regex(s))) - { - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - } else { - for (const auto &s : to_gpu_lv) - { - if 
(std::regex_search(name, std::regex(s))) - { - std::regex pattern(R"(\d+)"); - std::smatch match; - int layer_id = 0; - if (std::regex_search(name, match, pattern)) - { - std::string digitStr = match.str(); - int num = std::stoi(digitStr); - layer_id = num; - } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - if (layer_id > model_params.n_gpu_layers) - break; - // printf("name %s\n", name.c_str()); - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - - } - if (tensor->backend == GGML_BACKEND_GPU) { - #if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); - #endif - } - for (const auto &s : to_lock) - { - if (std::regex_match(name, std::regex(s))) - { - if(!mlock(tensor->data, ggml_nbytes(tensor))) { - // printf("mlock %s\n", name.c_str()); - } - else { - printf("mlock failed %s\n", name.c_str()); - } - } - } - - total_size += ggml_nbytes(tensor); - } - ggml_set_no_alloc(ctx, false); - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - printf("load finish\n"); - // int k; - // scanf("%d", &k); - - fin.close(); - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); - // static std::vector buf(buf_size); - static void * buf = ggml_cuda_host_malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i + 2; - } - } - offload_func_t offload_func = opt_nop; - offload_func_t offload_func_kq = opt_nop; - offload_func_t offload_func_v = opt_nop; - offload_func_t offload_func_nr = opt_nop; - offload_func_t offload_debug = opt_nop; -#ifdef GGML_USE_CUBLAS - offload_debug = ggml_cuda_assign_buffers_no_alloc; - // offload_func = ggml_cuda_assign_buffers_no_alloc; - // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - // offload_func_v = ggml_cuda_assign_buffers_no_alloc; - // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; -#endif - // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; - // int k; - // scanf("%d", &k); - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } 
- - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - ggml_set_name(inpL, "inpL_first"); - // offload_func(inpL); - - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - offload_func(cur); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g); - offload_func(cur); - ggml_set_name(cur, "ln_1_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_1_b); - ggml_set_name(cur, "ln_1_b"); - // offload_func(cur); - - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - - struct ggml_tensor *k_cpy = nullptr; - struct ggml_tensor *v_cpy = nullptr; - // self-attention - { - // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); - offload_func_kq(Qcur); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); - offload_func_kq(Kcur); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); - offload_func_kq(Kcur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); - offload_func_v(Vcur); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); - offload_func_v(Vcur); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - offload_func_v(Vcur); - - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - offload_func_kq(k); - // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); - - offload_func_v(v); - k_cpy = ggml_cpy(ctx0, Kcur, k); - offload_func_kq(k_cpy); - ggml_set_name(k_cpy, "k_cpy"); - v_cpy = ggml_cpy(ctx0, Vcur, v); - offload_func_v(v_cpy); - ggml_set_name(v_cpy, "v_cpy"); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); - offload_func_kq(Qcur); - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - offload_func_kq(Q); - - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - // struct ggml_tensor * K = - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 
0, 2, 1, 3); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, model.memory_k, - 128, n_past + N, n_head, - ggml_element_size(model.memory_k)*n_embd, - ggml_element_size(model.memory_k)*128, - ggml_element_size(model.memory_k)*n_embd*n_ctx*il); - K->src[1] = k_cpy; - offload_func_kq(K); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); - offload_func_kq(KQ_scaled); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, 128, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*128, - n_ctx*ggml_element_size(model.memory_k)*n_embd*il); - V->src[1] = v_cpy; - offload_func_v(V); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merge_cont"); - offload_func_v(cur); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_proj"); - offload_func(cur); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_set_name(cur, "attn_bias"); - offload_func(cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - ggml_set_name(cur, "after attn"); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - ggml_tensor *idx = nullptr; - ggml_tensor *idx_g = nullptr; - ggml_tensor *cur_c = nullptr; - - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - offload_func(cur); - ggml_set_name(cur, "norm_FFN"); - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g); - offload_func(cur); - ggml_set_name(cur, "norm_FFN_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_2_b); - // offload_func(cur); - // ggml_set_name(cur, "norm_FFN_w"); - // cur_c = ggml_dup(ctx0, cur); - } - // if (N == 1) - if (1) - { - idx = ggml_mul_mat(ctx0, - 
model.layers[il].mlp_pre_w1_w, - inpFF); - offload_func(idx); - ggml_set_name(idx, "mlp_pre_w1"); - idx = ggml_relu(ctx0, idx); - offload_func(idx); - ggml_set_name(idx, "relu_pre"); - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w2_w, - idx); - ggml_set_name(idx, "mlp_pre_w2"); - // offload_func(idx); - // idx = ggml_sigmoid(ctx0, idx); - // offload_func(idx); - // idx_g = idx; - // idx = ggml_dup(ctx0, idx_g); - // ggml_set_name(idx, "idx_cpu_dup"); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - if (N >= 80) - // if (0) - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - offload_debug(cur); - offload_func(cur); - ggml_set_name(cur, "up_ffn"); - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - offload_debug(cur); - offload_func(cur); - } - else - { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_fc_w, - // cur); - // offload_func(cur); - // cur = ggml_add(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b); - // offload_func(cur); - - - struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, - model.layers[il].c_mlp_fc_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "mlp_up_gpu"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_mul_mat_idx(ctx0, - model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_idx); - ggml_set_name(cur, "mlp_up_cpu"); - cur = ggml_add_idx(ctx0, - cur, - model.layers[il].c_mlp_fc_b, - idx); - ggml_set_name(tmp, "mlp_up_bias"); - offload_debug(tmp); - offload_func(tmp); - - cur = ggml_add(ctx0, cur, tmp); - ggml_set_name(cur, "mlp_up_mix"); - offload_func(cur); - - // cur = tmp; - - } - - - - // GELU activation - // [3072, N] - cur = ggml_relu(ctx0, cur); - // cur_c = cur; - // offload_func(cur); - cur_c = cur->backend==GGML_BACKEND_CPU? 
cur : ggml_dup(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - if (N >= 80) { - // if (0) { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur, - NULL, - NULL); - offload_debug(cur); - offload_func(cur); - ggml_set_name(cur, "down_ffn"); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - offload_func(cur); - offload_debug(cur); - } - else { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - // offload_func(cur); - - // cur = ggml_axpy(ctx0, - // model.layers[il].c_mlp_proj_w_t, - // cur, - // NULL, - // NULL); - // offload_func(cur); - - - // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, - // model.layers[il].c_mlp_proj_w_gpu, - // cur, - // model.layers[il].gpu_bucket, - // NULL); - struct ggml_tensor *tmp = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "axpy"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur_c, - idx, - model.layers[il].gpu_idx); - - cur = ggml_add(ctx0, cur, tmp); - offload_func(cur); - - cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); - offload_func(cur); - - // tmp = ggml_add(ctx0, - // tmp, - // model.layers[il].c_mlp_proj_b); - // offload_func(tmp); - // offload_debug(tmp); - - // cur = tmp; - } - - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - offload_func(inpL); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - offload_func_nr(inpL); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_mul(ctx0, - inpL, - model.ln_f_g); - offload_func_nr(inpL); - inpL = ggml_add(ctx0, - inpL, - model.ln_f_b); - ggml_set_name(inpL, "before"); - offload_func_nr(inpL); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_set_name(inpL, "last_layer"); -// offload_func(inpL); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); - - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); - -#ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); - ggml_cuda_assign_scratch_offset(node, 
(char*)node->data - (char *) compute_buffer); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } -#endif - - - - // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); - - //if (n_past%100 == 0) { - // ggml_graph_print (gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, "hello world"); - } - printf("load finish\n"); - - // keep this buffer alive while evaluating the model - - struct ggml_allocr * allocr = NULL; - // allocate the compute buffer - { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - // compute_buffer.resize(mem_size); - compute_buffer = ggml_cuda_host_malloc(mem_size); - // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); - allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); - - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // 
submit the input prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - std::vector embd; - - int cnt = 0; - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - cnt += 1; - - if (cnt > 0) - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - llama_sampling_params & sparams = params.sparams; - const int top_k = sparams.top_k; - const float top_p = sparams.top_p; - const float temp = sparams.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-2-sparse/main.cpp_123 b/examples/gpt-2-sparse/main.cpp_123 deleted file mode 100644 index 4deed1df..00000000 --- a/examples/gpt-2-sparse/main.cpp_123 +++ /dev/null @@ -1,1592 +0,0 @@ -#include "ggml.h" -#include "ggml-alloc.h" -#include - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-cuda.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif -typedef void (*offload_func_t)(struct ggml_tensor * tensor); -void opt_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; -} -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - // struct ggml_tensor * c_attn_attn_w; - // struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_attn_q_w; - struct ggml_tensor * c_attn_attn_q_b; - - struct ggml_tensor * c_attn_attn_k_w; - struct ggml_tensor * c_attn_attn_k_b; - - struct ggml_tensor * c_attn_attn_v_w; - struct ggml_tensor * c_attn_attn_v_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * 
c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; - - struct ggml_tensor * gpu_idx; - struct ggml_tensor * gpu_bucket; - // gpu heat - struct ggml_tensor * c_mlp_fc_w_gpu; - struct ggml_tensor * c_mlp_proj_w_t; - struct ggml_tensor * c_mlp_proj_w_gpu; - - //predictor - struct ggml_tensor * mlp_pre_w1_w; - struct ggml_tensor * mlp_pre_w2_w; -}; - -struct opt_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - opt_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error("opt_file fail\n"); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - ~opt_file() { - if (fp) { - std::fclose(fp); - } - } -}; -#define _POSIX_MAPPED_FILES -#include -#include - -struct opt_mmap { - void * addr; - size_t size; - - opt_mmap(const opt_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error("mmap failed\n"); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~opt_mmap() { - munmap(addr, size); - } -#else - static constexpr bool SUPPORTED = false; - - opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -struct gpt2_model { - gpt2_hparams hparams; - struct opt_file * file; - struct opt_mmap * mapping; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -struct ggml_context * ctx0 = nullptr; -// std::vector compute_buffer; -void *compute_buffer; - -bool endsWith(const std::string& str, const std::string& suffix) { - if (str.length() < suffix.length()) { - return false; - } - return str.substr(str.length() - suffix.length()) 
== suffix; -} - - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - model.file = new opt_file(fname.c_str(), "rb"); - printf("size %d\n", model.file->size); - model.mapping = new opt_mmap(model.file, 0, false); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - /* int32_t n_vocab = 0; */ - /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ - - /* if (n_vocab != model.hparams.n_vocab) { */ - /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ - /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ - /* return false; */ - /* } */ - int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - printf("wtype %d\n", wtype); - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += 
n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - //need refactor - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); - //predictor - ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size = 0; - - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*51200; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - int main_gpu = 0; -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#else -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; - // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; - - // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); - // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // model.lm_head->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["output_norm.weight"] = &model.ln_f_g; - model.tensors["output_norm.bias"] = &model.ln_f_b; - - 
model.tensors["tok_embeddings.weight"] = &model.wte; - model.tensors["pos_embeddings.weight"] = &model.wpe; - model.tensors["output.weight"] = &model.lm_head; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - memset(&layer, 0, sizeof(gpt2_layer)); - - // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - // // need refine - // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); - // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); - // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); - - // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); - // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); - - // if (i <= 10) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); - // } else if (i <= 12) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); - // } else if (i <= 18) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); - - // } else if (i <= 21) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); - // } else if (i <= 26) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); - // } else if (i <= 31) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); - // } - - // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_w->backend = 
OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; - - // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; - // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; - model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; - - model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; - model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; - model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; - model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; - model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; - model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; - - model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; - model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; - model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; - } - } - - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - #ifdef GGML_USE_CUBLAS - // ggml_cuda_assign_buffers_no_scratch(model.memory_k); - // ggml_cuda_assign_buffers_no_scratch(model.memory_v); - #endif - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - ggml_set_no_alloc(ctx, true); - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - const std::vector to_gpu = { - "output_norm.bias", - "output_norm.weight", - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - // ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - "layers.*output_norm.weight", - "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_gpu_lv = { - // ".*attention.wq.weight", - // ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - // "layers.*output_norm.weight", - // "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_lock = { - "tok_embeddings.weight", - "pos_embeddings.weight", - // "output_norm.bias", - ".*attention.wq.weight", - ".*attention.wq.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - // ".*attention.wo.bias", - ".*feed_forward.w1.weight", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight", - ".*feed_forward.w2.bias", - ".*gpu.weight", - ".*attention_norm.weight", - ".*attention_norm.bias", - ".*output_norm.weight", - ".*output_norm.bias", - ".*attention.*fc1.weight", - ".*attention.*fc1.bias", - ".*attention.*fc2.weight", - ".*attention.*fc2.bias", - // ".*w2.bias", - // ".*w1.bias", - "output.weight", - }; - - while (true) { - int32_t n_dims; - 
int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - int64_t new_ne[2]; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - new_ne[i] = ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - ggml_tensor ** ptr = model.tensors[name]; - // printf("name %s ptr %p\n", name.c_str(), *ptr); - // int k; - // scanf("%d", &k); - *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); - - auto tensor = (ggml_tensor *)*model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - - // for debugging - if (1) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - std::streampos offset = fin.tellg(); - // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - fin.seekg(ggml_nbytes(tensor), std::ios::cur); - tensor->data = model.mapping->addr + static_cast(offset); - // if ( endsWith(name.c_str(), "weight_transpose")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i+4096]); - // } - // } - // printf("\n"); - // if (endsWith(name.c_str(), "weight_h20")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i]); - - // } - // int k; - // scanf("%d", &k); - // } - - // // GPT-2 models share the WTE tensor as the LM head - // if (name == "model/wte" && has_lm_head == false) { - // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - // } - - // if (name == "model/lm_head") { - // has_lm_head = true; - // } - if (model_params.low_vram == false) { - for (const auto &s : to_gpu) - { - // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) - // { - // std::regex pattern(R"(\d+)"); - // std::smatch match; - // int layer_id = 0; - // if (std::regex_search(name, match, pattern)) - // { - // std::string digitStr = match.str(); - // int num = std::stoi(digitStr); - // layer_id = num; - // } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - // if (layer_id > model_params.n_gpu_layers) - // break; - // } - if (std::regex_search(name, std::regex(s))) - { - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - } else { - for (const auto &s : to_gpu_lv) - { - if 
(std::regex_search(name, std::regex(s))) - { - std::regex pattern(R"(\d+)"); - std::smatch match; - int layer_id = 0; - if (std::regex_search(name, match, pattern)) - { - std::string digitStr = match.str(); - int num = std::stoi(digitStr); - layer_id = num; - } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - if (layer_id > model_params.n_gpu_layers) - break; - // printf("name %s\n", name.c_str()); - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - - } - if (tensor->backend == GGML_BACKEND_GPU) { - #if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); - #endif - } - for (const auto &s : to_lock) - { - if (std::regex_match(name, std::regex(s))) - { - if(!mlock(tensor->data, ggml_nbytes(tensor))) { - // printf("mlock %s\n", name.c_str()); - } - else { - printf("mlock failed %s\n", name.c_str()); - } - } - } - - total_size += ggml_nbytes(tensor); - } - ggml_set_no_alloc(ctx, false); - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - printf("load finish\n"); - // int k; - // scanf("%d", &k); - - fin.close(); - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); - // static std::vector buf(buf_size); - static void * buf = ggml_cuda_host_malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i + 2; - } - } - offload_func_t offload_func = opt_nop; - offload_func_t offload_func_kq = opt_nop; - offload_func_t offload_func_v = opt_nop; - offload_func_t offload_func_nr = opt_nop; - offload_func_t offload_debug = opt_nop; -#ifdef GGML_USE_CUBLAS - offload_debug = ggml_cuda_assign_buffers_no_alloc; - // offload_func = ggml_cuda_assign_buffers_no_alloc; - // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - // offload_func_v = ggml_cuda_assign_buffers_no_alloc; - // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; -#endif - // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; - // int k; - // scanf("%d", &k); - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } 
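#if 0   // [illustrative sketch added in review, not part of the deleted file]
// The !ggml_allocr_is_measure(allocr) guards above exist because gpt2_graph() is run twice:
// once under a "measure" allocator (with no backing buffer yet) to size the compute buffer,
// and then on every eval with a real allocator. Below is a minimal sketch of that two-pass
// setup, mirroring the allocation code in main() later in this file and using only the
// ggml-alloc calls that already appear there; the helper name make_compute_allocr and the
// std::vector<gpt_vocab::id> element type are assumptions.
static struct ggml_allocr * make_compute_allocr(const gpt2_model & model, int n_batch, void ** out_buf) {
    // pass 1: measure — build the worst-case graph (a full batch at the end of the context window)
    struct ggml_allocr * a = ggml_allocr_new_measure(GGML_MEM_ALIGN);
    const int n_tokens = std::min(model.hparams.n_ctx, n_batch);
    const int n_past   = model.hparams.n_ctx - n_tokens;
    struct ggml_cgraph * gf = gpt2_graph(model, a, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
    const size_t mem_size = ggml_allocr_alloc_graph(a, gf) + GGML_MEM_ALIGN;
    ggml_allocr_free(a);

    // pass 2: allocate a real (pinned) buffer of the measured size and return an allocator over it
    *out_buf = ggml_cuda_host_malloc(mem_size);
    return ggml_allocr_new(*out_buf, mem_size, GGML_MEM_ALIGN);
}
#endif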
- - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - ggml_set_name(inpL, "inpL_first"); - // offload_func(inpL); - - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - offload_func(cur); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g); - offload_func(cur); - ggml_set_name(cur, "ln_1_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_1_b); - ggml_set_name(cur, "ln_1_b"); - // offload_func(cur); - - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - - struct ggml_tensor *k_cpy = nullptr; - struct ggml_tensor *v_cpy = nullptr; - // self-attention - { - // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); - offload_func_kq(Qcur); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); - offload_func_kq(Kcur); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); - offload_func_kq(Kcur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); - offload_func_v(Vcur); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); - offload_func_v(Vcur); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - offload_func_v(Vcur); - - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - offload_func_kq(k); - // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); - - offload_func_v(v); - k_cpy = ggml_cpy(ctx0, Kcur, k); - offload_func_kq(k_cpy); - ggml_set_name(k_cpy, "k_cpy"); - v_cpy = ggml_cpy(ctx0, Vcur, v); - offload_func_v(v_cpy); - ggml_set_name(v_cpy, "v_cpy"); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); - offload_func_kq(Qcur); - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - offload_func_kq(Q); - - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - // struct ggml_tensor * K = - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 
0, 2, 1, 3); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, model.memory_k, - 128, n_past + N, n_head, - ggml_element_size(model.memory_k)*n_embd, - ggml_element_size(model.memory_k)*128, - ggml_element_size(model.memory_k)*n_embd*n_ctx*il); - K->src[1] = k_cpy; - offload_func_kq(K); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); - offload_func_kq(KQ_scaled); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, 128, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*128, - n_ctx*ggml_element_size(model.memory_k)*n_embd*il); - V->src[1] = v_cpy; - offload_func_v(V); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merge_cont"); - offload_func_v(cur); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_proj"); - offload_func(cur); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_set_name(cur, "attn_bias"); - offload_func(cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - ggml_set_name(cur, "after attn"); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - ggml_tensor *idx = nullptr; - ggml_tensor *idx_g = nullptr; - ggml_tensor *cur_c = nullptr; - - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - offload_func(cur); - ggml_set_name(cur, "norm_FFN"); - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g); - offload_func(cur); - ggml_set_name(cur, "norm_FFN_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_2_b); - // offload_func(cur); - // ggml_set_name(cur, "norm_FFN_w"); - // cur_c = ggml_dup(ctx0, cur); - } - // if (N == 1) - if (1) - { - idx = ggml_mul_mat(ctx0, - 
model.layers[il].mlp_pre_w1_w, - inpFF); - offload_func(idx); - ggml_set_name(idx, "mlp_pre_w1"); - idx = ggml_relu(ctx0, idx); - offload_func(idx); - ggml_set_name(idx, "relu_pre"); - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w2_w, - idx); - ggml_set_name(idx, "mlp_pre_w2"); - // offload_func(idx); - // idx = ggml_sigmoid(ctx0, idx); - // offload_func(idx); - // idx_g = idx; - // idx = ggml_dup(ctx0, idx_g); - // ggml_set_name(idx, "idx_cpu_dup"); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - if (N >= 80) - // if (0) - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - offload_debug(cur); - offload_func(cur); - ggml_set_name(cur, "up_ffn"); - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - offload_debug(cur); - offload_func(cur); - } - else - { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_fc_w, - // cur); - // offload_func(cur); - // cur = ggml_add(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b); - // offload_func(cur); - - - struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, - model.layers[il].c_mlp_fc_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "mlp_up_gpu"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_mul_mat_idx(ctx0, - model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_idx); - ggml_set_name(cur, "mlp_up_cpu"); - cur = ggml_add_idx(ctx0, - cur, - model.layers[il].c_mlp_fc_b, - idx); - ggml_set_name(tmp, "mlp_up_bias"); - offload_debug(tmp); - offload_func(tmp); - - cur = ggml_add(ctx0, cur, tmp); - ggml_set_name(cur, "mlp_up_mix"); - offload_func(cur); - - // cur = tmp; - - } - - - - // GELU activation - // [3072, N] - cur = ggml_relu(ctx0, cur); - // cur_c = cur; - // offload_func(cur); - cur_c = cur->backend==GGML_BACKEND_CPU? 
cur : ggml_dup(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - if (N >= 80) { - // if (0) { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur, - NULL, - NULL); - offload_debug(cur); - offload_func(cur); - ggml_set_name(cur, "down_ffn"); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - offload_func(cur); - offload_debug(cur); - } - else { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - // offload_func(cur); - - // cur = ggml_axpy(ctx0, - // model.layers[il].c_mlp_proj_w_t, - // cur, - // NULL, - // NULL); - // offload_func(cur); - - - // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, - // model.layers[il].c_mlp_proj_w_gpu, - // cur, - // model.layers[il].gpu_bucket, - // NULL); - struct ggml_tensor *tmp = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "axpy"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur_c, - idx, - model.layers[il].gpu_idx); - - cur = ggml_add(ctx0, cur, tmp); - offload_func(cur); - - cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); - offload_func(cur); - - // tmp = ggml_add(ctx0, - // tmp, - // model.layers[il].c_mlp_proj_b); - // offload_func(tmp); - // offload_debug(tmp); - - // cur = tmp; - } - - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - offload_func(inpL); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - offload_func_nr(inpL); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_mul(ctx0, - inpL, - model.ln_f_g); - offload_func_nr(inpL); - inpL = ggml_add(ctx0, - inpL, - model.ln_f_b); - ggml_set_name(inpL, "before"); - offload_func_nr(inpL); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_set_name(inpL, "last_layer"); -// offload_func(inpL); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); - - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); - -#ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); - ggml_cuda_assign_scratch_offset(node, 
(char*)node->data - (char *) compute_buffer); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } -#endif - - - - // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); - - //if (n_past%100 == 0) { - // ggml_graph_print (gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, "hello world"); - } - printf("load finish\n"); - - // keep this buffer alive while evaluating the model - - struct ggml_allocr * allocr = NULL; - // allocate the compute buffer - { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - // compute_buffer.resize(mem_size); - compute_buffer = ggml_cuda_host_malloc(mem_size); - // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); - allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); - - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // submit the input 
prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - std::vector embd; - - int cnt = 0; - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - cnt += 1; - - if (cnt > 0) - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-2-sparse/main.cpp_bak b/examples/gpt-2-sparse/main.cpp_bak deleted file mode 100644 index e1e9d58e..00000000 --- a/examples/gpt-2-sparse/main.cpp_bak +++ /dev/null @@ -1,1546 +0,0 @@ -#include "ggml.h" -#include "ggml-alloc.h" -#include - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-cuda.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif -typedef void (*offload_func_t)(struct ggml_tensor * tensor); -void opt_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; -} -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - // struct ggml_tensor * c_attn_attn_w; - // struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_attn_q_w; - struct ggml_tensor * c_attn_attn_q_b; - - struct ggml_tensor * c_attn_attn_k_w; - struct ggml_tensor * c_attn_attn_k_b; - - struct ggml_tensor * c_attn_attn_v_w; - struct ggml_tensor * c_attn_attn_v_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct 
ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; - - struct ggml_tensor * gpu_idx; - struct ggml_tensor * gpu_bucket; - // gpu heat - struct ggml_tensor * c_mlp_fc_w_gpu; - struct ggml_tensor * c_mlp_proj_w_t; - struct ggml_tensor * c_mlp_proj_w_gpu; - - //predictor - struct ggml_tensor * mlp_pre_w1_w; - struct ggml_tensor * mlp_pre_w2_w; -}; - -struct opt_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - opt_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error("opt_file fail\n"); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - ~opt_file() { - if (fp) { - std::fclose(fp); - } - } -}; -#define _POSIX_MAPPED_FILES -#include -#include - -struct opt_mmap { - void * addr; - size_t size; - - opt_mmap(const opt_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error("mmap failed\n"); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~opt_mmap() { - munmap(addr, size); - } -#else - static constexpr bool SUPPORTED = false; - - opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -struct gpt2_model { - gpt2_hparams hparams; - struct opt_file * file; - struct opt_mmap * mapping; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -struct ggml_context * ctx0 = nullptr; -// std::vector compute_buffer; -void *compute_buffer; - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - model.file = new 
opt_file(fname.c_str(), "rb"); - printf("size %d\n", model.file->size); - model.mapping = new opt_mmap(model.file, 0, false); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - /* int32_t n_vocab = 0; */ - /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ - - /* if (n_vocab != model.hparams.n_vocab) { */ - /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ - /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ - /* return false; */ - /* } */ - int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - printf("wtype %d\n", wtype); - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size 
+= n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - //need refactor - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); - //predictor - ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size = 0; - - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*51200; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - int main_gpu = 0; -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#else -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; - // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; - - // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); - // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // model.lm_head->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["output_norm.weight"] = &model.ln_f_g; - model.tensors["output_norm.bias"] = &model.ln_f_b; - - model.tensors["tok_embeddings.weight"] = &model.wte; - model.tensors["pos_embeddings.weight"] = &model.wpe; - model.tensors["output.weight"] = &model.lm_head; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - memset(&layer, 0, sizeof(gpt2_layer)); - - // layer.ln_1_g = ggml_new_tensor_1d(ctx, 
GGML_TYPE_F32, n_embd); - // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - // // need refine - // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); - // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); - // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); - - // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); - // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); - - // if (i <= 10) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); - // } else if (i <= 12) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); - // } else if (i <= 18) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); - - // } else if (i <= 21) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); - // } else if (i <= 26) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); - // } else if (i <= 31) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); - // } - - // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; - // // 
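// Standalone sketch, not from the patch: the tensor map in this loader stores
// `ggml_tensor **` slots keyed by name, and the tensors themselves are only created
// later in the load loop via `*ptr = ggml_new_tensor(...)`. A minimal version of that
// deferred-creation registry, with a plain struct ("Blob") standing in for ggml_tensor:
#include <cstdio>
#include <map>
#include <string>

struct Blob { int n; };                           // stand-in for ggml_tensor

int main() {
    Blob *ln_f_g = nullptr;                       // slots start out empty
    Blob *ln_f_b = nullptr;

    std::map<std::string, Blob **> registry;      // name -> address of the slot
    registry["output_norm.weight"] = &ln_f_g;
    registry["output_norm.bias"]   = &ln_f_b;

    // later, while scanning the file, each name is looked up and the object is
    // created in place, assigning through the double pointer
    for (auto &kv : registry) {
        *kv.second = new Blob{ (int) kv.first.size() };
    }

    std::printf("ln_f_g->n = %d, ln_f_b->n = %d\n", ln_f_g->n, ln_f_b->n);
    delete ln_f_g;
    delete ln_f_b;
    return 0;
}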
layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; - - // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; - // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; - model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; - - model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; - model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; - model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; - model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; - model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; - model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; - - model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; - model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; - model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; - } - } - - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - #ifdef GGML_USE_CUBLAS - // ggml_cuda_assign_buffers_no_scratch(model.memory_k); - // ggml_cuda_assign_buffers_no_scratch(model.memory_v); - #endif - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - ggml_set_no_alloc(ctx, true); - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - const std::vector to_gpu = { - "output_norm.bias", - "output_norm.weight", - // ".*attention.wq.weight", - // ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - // ".*feed_forward.w1.weight$", - ".*feed_forward.w1.bias", - // ".*feed_forward.w2.weight_h20$", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight$", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - "layers.*output_norm.weight", - "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_gpu_lv = { - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - // "layers.*output_norm.weight", - // "layers.*output_norm.bias", - // ".*fc1.weight", - // ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_lock = { - "tok_embeddings.weight", - "pos_embeddings.weight", - // "output_norm.bias", - ".*attention.wq.weight", - ".*attention.wq.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - // ".*attention.wo.bias", - ".*feed_forward.w1.weight", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight", - ".*feed_forward.w2.bias", - ".*gpu.weight", - ".*attention_norm.weight", - ".*attention_norm.bias", - ".*output_norm.weight", - ".*output_norm.bias", - ".*attention.*fc1.weight", - ".*attention.*fc1.bias", - ".*attention.*fc2.weight", - ".*attention.*fc2.bias", - // ".*w2.bias", - // 
".*w1.bias", - "output.weight", - }; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - int64_t new_ne[2]; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - new_ne[i] = ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - ggml_tensor ** ptr = model.tensors[name]; - // printf("name %s ptr %p\n", name.c_str(), *ptr); - // int k; - // scanf("%d", &k); - *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); - - auto tensor = (ggml_tensor *)*model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - std::streampos offset = fin.tellg(); - // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - fin.seekg(ggml_nbytes(tensor), std::ios::cur); - tensor->data = model.mapping->addr + static_cast(offset); - - // // GPT-2 models share the WTE tensor as the LM head - // if (name == "model/wte" && has_lm_head == false) { - // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - // } - - // if (name == "model/lm_head") { - // has_lm_head = true; - // } - if (model_params.low_vram == false) { - for (const auto &s : to_gpu) - { - if (std::regex_search(name, std::regex(s))) - { - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - } else { - for (const auto &s : to_gpu_lv) - { - if (std::regex_search(name, std::regex(s))) - { - std::regex pattern(R"(\d+)"); - std::smatch match; - int layer_id = 0; - if (std::regex_search(name, match, pattern)) - { - std::string digitStr = match.str(); - int num = std::stoi(digitStr); - layer_id = num; - } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - if (layer_id > model_params.n_gpu_layers) - break; - // printf("name %s\n", name.c_str()); - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - - } - if (tensor->backend == GGML_BACKEND_GPU) { - #if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); - #endif - } - for (const auto &s : to_lock) - { - if (std::regex_match(name, std::regex(s))) - { - if(!mlock(tensor->data, ggml_nbytes(tensor))) { - // printf("mlock %s\n", name.c_str()); - } - else { - printf("mlock 
failed %s\n", name.c_str()); - } - } - } - - total_size += ggml_nbytes(tensor); - } - ggml_set_no_alloc(ctx, false); - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - - fin.close(); - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); - // static std::vector buf(buf_size); - static void * buf = ggml_cuda_host_malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i + 2; - } - } - offload_func_t offload_func = opt_nop; - offload_func_t offload_func_kq = opt_nop; - offload_func_t offload_func_v = opt_nop; - offload_func_t offload_func_nr = opt_nop; - offload_func_t offload_debug = opt_nop; -#ifdef GGML_USE_CUBLAS - // offload_debug = ggml_cuda_assign_buffers_no_alloc; - // offload_func = ggml_cuda_assign_buffers_no_alloc; - // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - // offload_func_v = ggml_cuda_assign_buffers_no_alloc; - // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; -#endif - // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; - // int k; - // scanf("%d", &k); - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - ggml_set_name(inpL, "inpL_first"); - // offload_func(inpL); - - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - offload_func(cur); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g); - offload_func(cur); - ggml_set_name(cur, "ln_1_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_1_b); - ggml_set_name(cur, "ln_1_b"); - // offload_func(cur); - - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - - struct ggml_tensor *k_cpy 
= nullptr; - struct ggml_tensor *v_cpy = nullptr; - // self-attention - { - // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); - offload_func_kq(Qcur); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); - offload_func_kq(Kcur); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); - offload_func_kq(Kcur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); - offload_func_v(Vcur); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); - offload_func_v(Vcur); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - offload_func_v(Vcur); - - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - offload_func_kq(k); - // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); - - offload_func_v(v); - k_cpy = ggml_cpy(ctx0, Kcur, k); - offload_func_kq(k_cpy); - ggml_set_name(k_cpy, "k_cpy"); - v_cpy = ggml_cpy(ctx0, Vcur, v); - offload_func_v(v_cpy); - ggml_set_name(v_cpy, "v_cpy"); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); - offload_func_kq(Qcur); - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - offload_func_kq(Q); - - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - // struct ggml_tensor * K = - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 0, 2, 1, 3); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, model.memory_k, - 128, n_past + N, n_head, - ggml_element_size(model.memory_k)*n_embd, - ggml_element_size(model.memory_k)*128, - ggml_element_size(model.memory_k)*n_embd*n_ctx*il); - K->src[1] = k_cpy; - offload_func_kq(K); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // 
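// Standalone sketch, not from the patch: memory_k / memory_v above are single flat F16
// buffers of n_embd * n_ctx * n_layer elements, and each layer's slice is addressed purely
// by byte offsets (the arithmetic the ggml_view_1d/3d calls encode). A scalar version of
// that indexing, with toy sizes that are assumptions for the demo, not real hparams:
#include <cstdio>
#include <cstdint>
#include <vector>

int main() {
    const int n_embd  = 8;                   // assumed toy sizes
    const int n_ctx   = 4;
    const int n_layer = 2;
    const size_t elt  = sizeof(uint16_t);    // cache is stored as F16

    std::vector<uint16_t> memory_k(n_embd * n_ctx * n_layer, 0);

    // where layer `il` writes the key for position `n_past`
    // (cf. the view offset (element_size*n_embd)*(il*n_ctx + n_past) above)
    const int il = 1, n_past = 2;
    const size_t byte_offset = elt * n_embd * (il * n_ctx + n_past);
    const size_t elem_index  = byte_offset / elt;
    memory_k[elem_index] = 42;

    std::printf("layer %d, pos %d -> element %zu (byte offset %zu), value %u\n",
                il, n_past, elem_index, byte_offset, (unsigned) memory_k[elem_index]);
    return 0;
}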
[n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); - offload_func_kq(KQ_scaled); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, 128, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*128, - n_ctx*ggml_element_size(model.memory_k)*n_embd*il); - V->src[1] = v_cpy; - offload_func_v(V); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merge_cont"); - offload_func_v(cur); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_proj"); - offload_func(cur); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_set_name(cur, "attn_bias"); - offload_func(cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - ggml_set_name(cur, "after attn"); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - ggml_tensor *idx = nullptr; - ggml_tensor *idx_g = nullptr; - ggml_tensor *cur_c = nullptr; - - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - offload_func(cur); - ggml_set_name(cur, "norm_FFN"); - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g); - offload_func(cur); - ggml_set_name(cur, "norm_FFN_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_2_b); - // offload_func(cur); - // ggml_set_name(cur, "norm_FFN_w"); - // cur_c = ggml_dup(ctx0, cur); - } - // if (N == 1) - if (1) - { - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w1_w, - inpFF); - offload_func(idx); - ggml_set_name(idx, "mlp_pre_w1"); - idx = ggml_relu(ctx0, idx); - offload_func(idx); - ggml_set_name(idx, "relu_pre"); - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w2_w, - idx); - ggml_set_name(idx, "mlp_pre_w2"); - // offload_func(idx); - // idx = ggml_sigmoid(ctx0, idx); - // offload_func(idx); - // idx_g = idx; - // idx = ggml_dup(ctx0, idx_g); - // ggml_set_name(idx, "idx_cpu_dup"); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - if (N != 1) - // if (0) - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - offload_func(cur); - ggml_set_name(cur, "up_ffn"); - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - offload_func(cur); - } 
- else - { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_fc_w, - // cur); - // offload_func(cur); - // cur = ggml_add(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b); - // offload_func(cur); - - - struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, - model.layers[il].c_mlp_fc_w_gpu, - // model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "mlp_up_gpu"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_mul_mat_idx(ctx0, - model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_idx); - ggml_set_name(cur, "mlp_up_cpu"); - - // cur = ggml_add_idx(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b, - // idx); - // offload_func(cur); - tmp = ggml_add_idx(ctx0, - tmp, - model.layers[il].c_mlp_fc_b, - idx); - offload_debug(tmp); - - - // cur = ggml_add(ctx0, cur, tmp); - // ggml_set_name(cur, "mlp_up_mix"); - // offload_func(cur); - - // cur = tmp; - - } - - - - // GELU activation - // [3072, N] - cur = ggml_relu(ctx0, cur); - // cur_c = cur; - // offload_func(cur); - cur_c = cur->backend==GGML_BACKEND_CPU? cur : ggml_dup(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - // if (N != 1) { - if (0) { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_proj_w, - cur); - offload_func(cur); - ggml_set_name(cur, "down_ffn"); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - offload_func(cur); - } - else { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - // offload_func(cur); - - // cur = ggml_axpy(ctx0, - // model.layers[il].c_mlp_proj_w_t, - // cur, - // NULL, - // NULL); - // offload_func(cur); - - - //here - // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, - // model.layers[il].c_mlp_proj_w_gpu, - // cur, - // model.layers[il].gpu_bucket, - // NULL); - // ggml_set_name(tmp, "axpy"); - // offload_func(tmp); - // offload_debug(tmp); - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur_c, - // NULL, - // NULL); - idx, - model.layers[il].gpu_bucket); - // model.layers[il].gpu_idx); - // offload_func(cur); - - cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); - offload_func(cur); - - // tmp = ggml_add(ctx0, - // tmp, - // model.layers[il].c_mlp_proj_b); - // offload_func(tmp); - // offload_debug(tmp); - - // cur = ggml_add(ctx0, cur, tmp); - // offload_func(cur); - } - - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - offload_func(inpL); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - offload_func_nr(inpL); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_mul(ctx0, - inpL, - model.ln_f_g); - offload_func_nr(inpL); - inpL = ggml_add(ctx0, - inpL, - model.ln_f_b); - ggml_set_name(inpL, "before"); - offload_func_nr(inpL); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_set_name(inpL, "last_layer"); -// offload_func(inpL); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - 
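// Standalone sketch, not from the patch: the FFN above first runs a small ReLU MLP
// (mlp_pre_w1_w / mlp_pre_w2_w) to predict which FFN neurons will fire, then evaluates
// only those rows through the custom ggml_mul_mat_idx / ggml_axpy ops. A scalar CPU
// reference of that idea; the threshold and sizes below are assumptions for the demo:
#include <cstdio>
#include <vector>

// y[j] = sum_k W[j][k] * x[k], computed only for rows the predictor marked active
static std::vector<float> sparse_ffn_up(const std::vector<std::vector<float>> &W,
                                        const std::vector<float> &x,
                                        const std::vector<float> &score,
                                        float threshold) {
    std::vector<float> y(W.size(), 0.0f);
    for (size_t j = 0; j < W.size(); ++j) {
        if (score[j] <= threshold) continue;       // predictor says this neuron stays ~0
        float acc = 0.0f;
        for (size_t k = 0; k < x.size(); ++k) acc += W[j][k] * x[k];
        y[j] = acc > 0.0f ? acc : 0.0f;            // ReLU, as in the graph above
    }
    return y;
}

int main() {
    std::vector<std::vector<float>> W = {{1, 2}, {3, 4}, {5, 6}};  // 3 FFN neurons, n_embd = 2
    std::vector<float> x     = {0.5f, -0.25f};
    std::vector<float> score = {0.9f, -0.3f, 0.7f};                // row 1 predicted inactive
    std::vector<float> y     = sparse_ffn_up(W, x, score, 0.0f);
    for (float v : y) std::printf("%.2f ", v);                     // row 1 is skipped -> 0.00
    std::printf("\n");
    return 0;
}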
embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); - - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); - -#ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } -#endif - - - - // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); - - //if (n_past%100 == 0) { - // ggml_graph_print (gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, "hello world"); - } - printf("load finish\n"); - - // keep this buffer alive while evaluating the model - - struct ggml_allocr * allocr = NULL; - // allocate the compute buffer - { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - // 
compute_buffer.resize(mem_size); - compute_buffer = ggml_cuda_host_malloc(mem_size); - // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); - allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); - - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // submit the input prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - std::vector embd; - - int cnt = 0; - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - cnt += 1; - - if (cnt > 0) - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-2-sparse/main13b.cpp b/examples/gpt-2-sparse/main13b.cpp deleted file mode 100644 index 0681da3e..00000000 --- a/examples/gpt-2-sparse/main13b.cpp +++ /dev/null @@ -1,1583 +0,0 @@ -#include "ggml.h" -#include "ggml-alloc.h" -#include - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-cuda.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif -typedef void (*offload_func_t)(struct ggml_tensor * tensor); -void opt_nop(struct ggml_tensor * tensor) { // 
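// Standalone sketch, not from the patch: the generation loop above delegates sampling to
// gpt_sample_top_k_top_p() from common.h. A generic, self-contained top-k / top-p (nucleus)
// sampler over a raw logits vector, not necessarily identical to that helper's behaviour:
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

static int sample_top_k_top_p(const std::vector<float> &logits,
                              int top_k, float top_p, float temp, std::mt19937 &rng) {
    const int n = (int) logits.size();
    std::vector<int> idx(n);
    for (int i = 0; i < n; ++i) idx[i] = i;

    // sort token ids by logit, descending, and keep only the top_k of them
    std::sort(idx.begin(), idx.end(), [&](int a, int b) { return logits[a] > logits[b]; });
    if (top_k > 0 && top_k < n) idx.resize(top_k);

    // softmax with temperature over the surviving candidates
    std::vector<double> probs(idx.size());
    double maxl = logits[idx[0]], sum = 0.0;
    for (size_t i = 0; i < idx.size(); ++i) {
        probs[i] = std::exp((logits[idx[i]] - maxl) / temp);
        sum += probs[i];
    }
    for (double &p : probs) p /= sum;

    // nucleus cut: keep the smallest prefix whose cumulative probability reaches top_p;
    // std::discrete_distribution renormalizes the kept weights internally
    double cum = 0.0;
    size_t keep = probs.size();
    for (size_t i = 0; i < probs.size(); ++i) {
        cum += probs[i];
        if (cum >= top_p) { keep = i + 1; break; }
    }
    probs.resize(keep);

    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return idx[dist(rng)];
}

int main() {
    std::mt19937 rng(42);
    std::vector<float> logits = {1.0f, 4.0f, 0.5f, 3.0f};          // toy vocabulary of 4 tokens
    std::printf("sampled token id = %d\n", sample_top_k_top_p(logits, 3, 0.9f, 0.8f, rng));
    return 0;
}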
don't offload by default - (void) tensor; -} -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - // struct ggml_tensor * c_attn_attn_w; - // struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_attn_q_w; - struct ggml_tensor * c_attn_attn_q_b; - - struct ggml_tensor * c_attn_attn_k_w; - struct ggml_tensor * c_attn_attn_k_b; - - struct ggml_tensor * c_attn_attn_v_w; - struct ggml_tensor * c_attn_attn_v_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; - - struct ggml_tensor * gpu_idx; - struct ggml_tensor * gpu_bucket; - // gpu heat - struct ggml_tensor * c_mlp_fc_w_gpu; - struct ggml_tensor * c_mlp_proj_w_t; - struct ggml_tensor * c_mlp_proj_w_gpu; - - //predictor - struct ggml_tensor * mlp_pre_w1_w; - struct ggml_tensor * mlp_pre_w2_w; -}; - -struct opt_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - opt_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error("opt_file fail\n"); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - ~opt_file() { - if (fp) { - std::fclose(fp); - } - } -}; -#define _POSIX_MAPPED_FILES -#include -#include - -struct opt_mmap { - void * addr; - size_t size; - - opt_mmap(const opt_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error("mmap failed\n"); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~opt_mmap() { - munmap(addr, size); - } -#else - static constexpr bool SUPPORTED = false; - - opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw 
std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -struct gpt2_model { - gpt2_hparams hparams; - struct opt_file * file; - struct opt_mmap * mapping; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -struct ggml_context * ctx0 = nullptr; -// std::vector compute_buffer; -void *compute_buffer; - -bool endsWith(const std::string& str, const std::string& suffix) { - if (str.length() < suffix.length()) { - return false; - } - return str.substr(str.length() - suffix.length()) == suffix; -} - - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, gpt_params model_params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - model.file = new opt_file(fname.c_str(), "rb"); - printf("size %d\n", model.file->size); - model.mapping = new opt_mmap(model.file, 0, false); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - /* int32_t n_vocab = 0; */ - /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ - - /* if (n_vocab != model.hparams.n_vocab) { */ - /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ - /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ - /* return false; */ - /* } */ - int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - 
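// Standalone sketch, not from the patch: opt_file/opt_mmap above map the model file
// read-only, and the loader then points tensor->data at mapping->addr plus the tensor's
// file offset, so weights are paged in on demand rather than copied into the ggml context.
// A minimal POSIX example of that zero-copy pattern (argv[1] is any non-empty file):
#include <cstdio>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv) {
    if (argc < 2) { std::fprintf(stderr, "usage: %s <file>\n", argv[0]); return 1; }

    std::FILE *fp = std::fopen(argv[1], "rb");
    if (!fp) { std::perror("fopen"); return 1; }

    struct stat st;
    if (fstat(fileno(fp), &st) != 0 || st.st_size == 0) {
        std::fprintf(stderr, "could not stat file, or file is empty\n");
        std::fclose(fp);
        return 1;
    }

    // map the whole file read-only; MAP_SHARED matches what opt_mmap uses
    void *addr = mmap(nullptr, (size_t) st.st_size, PROT_READ, MAP_SHARED, fileno(fp), 0);
    if (addr == MAP_FAILED) { std::perror("mmap"); std::fclose(fp); return 1; }

    // a "tensor" whose data lives inside the mapping: just a pointer at some offset
    const size_t offset = 0;                         // would be fin.tellg() in the loader
    const unsigned char *tensor_data = (const unsigned char *) addr + offset;
    std::printf("first byte of mapped data: 0x%02x (file size %lld)\n",
                tensor_data[0], (long long) st.st_size);

    munmap(addr, (size_t) st.st_size);
    std::fclose(fp);
    return 0;
}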
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - printf("wtype %d\n", wtype); - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - //need refactor - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); - //predictor - ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size = 0; - - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*51200; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - int main_gpu = 0; -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#else -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - 
const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; - // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; - - // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); - // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // model.lm_head->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["output_norm.weight"] = &model.ln_f_g; - model.tensors["output_norm.bias"] = &model.ln_f_b; - - model.tensors["tok_embeddings.weight"] = &model.wte; - model.tensors["pos_embeddings.weight"] = &model.wpe; - model.tensors["output.weight"] = &model.lm_head; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - memset(&layer, 0, sizeof(gpt2_layer)); - - // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - // // need refine - // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); - // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); - // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); - - // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); - // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); - - // if (i <= 10) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); - // } else if (i <= 12) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); - // } else if (i <= 18) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); - - // } else if (i <= 21) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); - // } else if (i <= 26) { - // 
layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); - // } else if (i <= 31) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); - // } - - // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; - - // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; - // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; - model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; - - model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; - model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; - model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; - model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; - model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; - model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; - - model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; - model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; - - model.tensors["layers." 
+ std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; - model.tensors["layers." + std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; - } - } - - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - #ifdef GGML_USE_CUBLAS - // ggml_cuda_assign_buffers_no_scratch(model.memory_k); - // ggml_cuda_assign_buffers_no_scratch(model.memory_v); - #endif - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - ggml_set_no_alloc(ctx, true); - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - const std::vector to_gpu = { - "output_norm.bias", - "output_norm.weight", - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - // ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - "layers.*output_norm.weight", - "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_gpu_lv = { - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - // "layers.*output_norm.weight", - // "layers.*output_norm.bias", - // ".*fc1.weight", - // ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_lock = { - "tok_embeddings.weight", - "pos_embeddings.weight", - // "output_norm.bias", - ".*attention.wq.weight", - ".*attention.wq.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - // ".*attention.wo.bias", - ".*feed_forward.w1.weight", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight", - ".*feed_forward.w2.bias", - ".*gpu.weight", - ".*attention_norm.weight", - ".*attention_norm.bias", - ".*output_norm.weight", - ".*output_norm.bias", - 
".*attention.*fc1.weight", - ".*attention.*fc1.bias", - ".*attention.*fc2.weight", - ".*attention.*fc2.bias", - // ".*w2.bias", - // ".*w1.bias", - "output.weight", - }; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - int64_t new_ne[2]; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - new_ne[i] = ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - ggml_tensor ** ptr = model.tensors[name]; - // printf("name %s ptr %p\n", name.c_str(), *ptr); - // int k; - // scanf("%d", &k); - *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); - - auto tensor = (ggml_tensor *)*model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - std::streampos offset = fin.tellg(); - // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - fin.seekg(ggml_nbytes(tensor), std::ios::cur); - tensor->data = model.mapping->addr + static_cast(offset); - // if ( endsWith(name.c_str(), "weight_transpose")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i+4096]); - // } - // } - // printf("\n"); - // if (endsWith(name.c_str(), "weight_h20")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i]); - - // } - // int k; - // scanf("%d", &k); - // } - - // // GPT-2 models share the WTE tensor as the LM head - // if (name == "model/wte" && has_lm_head == false) { - // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - // } - - // if (name == "model/lm_head") { - // has_lm_head = true; - // } - if (model_params.low_vram == false) { - for (const auto &s : to_gpu) - { - // if (std::regex_search(name, std::regex(".*fc1.weight")) || std::regex_search(name, std::regex(".*fc2.weight"))) - // { - // std::regex pattern(R"(\d+)"); - // std::smatch match; - // int layer_id = 0; - // if (std::regex_search(name, match, pattern)) - // { - // std::string digitStr = match.str(); - // int num = std::stoi(digitStr); - // layer_id = num; - // } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - // if (layer_id > 
model_params.n_gpu_layers) - // break; - // } - // printf("name %s\n", name.c_str()); - if (std::regex_search(name, std::regex(s))) - { - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - } else { - for (const auto &s : to_gpu_lv) - { - if (std::regex_search(name, std::regex(s))) - { - std::regex pattern(R"(\d+)"); - std::smatch match; - int layer_id = 0; - if (std::regex_search(name, match, pattern)) - { - std::string digitStr = match.str(); - int num = std::stoi(digitStr); - layer_id = num; - } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - if (layer_id > model_params.n_gpu_layers) - break; - // printf("name %s\n", name.c_str()); - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - - } - if (tensor->backend == GGML_BACKEND_GPU) { - #if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); - #endif - } - for (const auto &s : to_lock) - { - if (std::regex_match(name, std::regex(s))) - { - if(!mlock(tensor->data, ggml_nbytes(tensor))) { - // printf("mlock %s\n", name.c_str()); - } - else { - printf("mlock failed %s\n", name.c_str()); - } - } - } - - total_size += ggml_nbytes(tensor); - } - ggml_set_no_alloc(ctx, false); - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - - fin.close(); - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); - // static std::vector buf(buf_size); - static void * buf = ggml_cuda_host_malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i + 2; - } - } - offload_func_t offload_func = opt_nop; - offload_func_t offload_func_kq = opt_nop; - offload_func_t offload_func_v = opt_nop; - offload_func_t offload_func_nr = opt_nop; - offload_func_t offload_debug = opt_nop; -#ifdef GGML_USE_CUBLAS - offload_debug = ggml_cuda_assign_buffers_no_alloc; - // offload_func = ggml_cuda_assign_buffers_no_alloc; - // offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - // offload_func_v = ggml_cuda_assign_buffers_no_alloc; - // offload_func_nr = ggml_cuda_assign_buffers_no_alloc; -#endif - // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; - // int k; - // scanf("%d", &k); - - struct ggml_tensor * KQ_scale = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - ggml_set_name(inpL, "inpL_first"); - // offload_func(inpL); - - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - offload_func(cur); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g); - offload_func(cur); - ggml_set_name(cur, "ln_1_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_1_b); - ggml_set_name(cur, "ln_1_b"); - // offload_func(cur); - - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - - struct ggml_tensor *k_cpy = nullptr; - struct ggml_tensor *v_cpy = nullptr; - // self-attention - { - // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); - offload_func_kq(Qcur); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); - offload_func_kq(Kcur); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); - offload_func_kq(Kcur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); - offload_func_v(Vcur); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); - offload_func_v(Vcur); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - offload_func_v(Vcur); - - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - offload_func_kq(k); - // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); - - offload_func_v(v); - k_cpy = ggml_cpy(ctx0, Kcur, k); - offload_func_kq(k_cpy); - ggml_set_name(k_cpy, "k_cpy"); - v_cpy = ggml_cpy(ctx0, Vcur, v); - offload_func_v(v_cpy); - ggml_set_name(v_cpy, "v_cpy"); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); - offload_func_kq(Qcur); - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - offload_func_kq(Q); - - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - // struct ggml_tensor * K = - // ggml_permute(ctx0, 
- // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 0, 2, 1, 3); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, model.memory_k, - 128, n_past + N, n_head, - ggml_element_size(model.memory_k)*n_embd, - ggml_element_size(model.memory_k)*128, - ggml_element_size(model.memory_k)*n_embd*n_ctx*il); - K->src[1] = k_cpy; - offload_func_kq(K); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); - offload_func_kq(KQ_scaled); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, 128, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*128, - n_ctx*ggml_element_size(model.memory_k)*n_embd*il); - V->src[1] = v_cpy; - offload_func_v(V); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merge_cont"); - offload_func_v(cur); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_proj"); - offload_func(cur); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_set_name(cur, "attn_bias"); - offload_func(cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - ggml_set_name(cur, "after attn"); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - ggml_tensor *idx = nullptr; - ggml_tensor *idx_g = nullptr; - ggml_tensor *cur_c = nullptr; - - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - offload_func(cur); - ggml_set_name(cur, "norm_FFN"); - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g); - offload_func(cur); - ggml_set_name(cur, "norm_FFN_g"); - cur = ggml_add(ctx0, - cur, - 
model.layers[il].ln_2_b); - // offload_func(cur); - // ggml_set_name(cur, "norm_FFN_w"); - // cur_c = ggml_dup(ctx0, cur); - } - // if (N == 1) - if (1) - { - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w1_w, - cur); - offload_func(idx); - ggml_set_name(idx, "mlp_pre_w1"); - idx = ggml_relu(ctx0, idx); - offload_func(idx); - ggml_set_name(idx, "relu_pre"); - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w2_w, - idx); - ggml_set_name(idx, "mlp_pre_w2"); - // offload_func(idx); - // idx = ggml_sigmoid(ctx0, idx); - // offload_func(idx); - // idx_g = idx; - // idx = ggml_dup(ctx0, idx_g); - // ggml_set_name(idx, "idx_cpu_dup"); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - // if (N != 1) - if (0) - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - offload_func(cur); - ggml_set_name(cur, "up_ffn"); - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - offload_func(cur); - } - else - { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_fc_w, - // cur); - // offload_func(cur); - // cur = ggml_add(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b); - // offload_func(cur); - - - struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, - model.layers[il].c_mlp_fc_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "mlp_up_gpu"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_mul_mat_idx(ctx0, - model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_idx); - ggml_set_name(cur, "mlp_up_cpu"); - tmp = ggml_add_idx(ctx0, - tmp, - model.layers[il].c_mlp_fc_b, - idx); - ggml_set_name(tmp, "mlp_up_bias"); - offload_debug(tmp); - offload_func(tmp); - - cur = ggml_add(ctx0, cur, tmp); - ggml_set_name(cur, "mlp_up_mix"); - offload_func(cur); - - // cur = tmp; - - } - - - - // GELU activation - // [3072, N] - cur = ggml_relu(ctx0, cur); - // cur_c = cur; - // offload_func(cur); - cur_c = cur->backend==GGML_BACKEND_CPU? 
cur : ggml_dup(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - // if (N != 1) { - if (0) { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_proj_w, - cur); - offload_func(cur); - ggml_set_name(cur, "down_ffn"); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - offload_func(cur); - } - else { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - // offload_func(cur); - - // cur = ggml_axpy(ctx0, - // model.layers[il].c_mlp_proj_w_t, - // cur, - // NULL, - // NULL); - // offload_func(cur); - - - // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, - // model.layers[il].c_mlp_proj_w_gpu, - // cur, - // model.layers[il].gpu_bucket, - // NULL); - struct ggml_tensor *tmp = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "axpy"); - offload_func(tmp); - offload_debug(tmp); - - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur_c, - idx, - model.layers[il].gpu_idx); - - cur = ggml_add(ctx0, cur, tmp); - offload_func(cur); - - cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); - offload_func(cur); - - // tmp = ggml_add(ctx0, - // tmp, - // model.layers[il].c_mlp_proj_b); - // offload_func(tmp); - // offload_debug(tmp); - - // cur = tmp; - } - - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - offload_func(inpL); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - offload_func_nr(inpL); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_mul(ctx0, - inpL, - model.ln_f_g); - offload_func_nr(inpL); - inpL = ggml_add(ctx0, - inpL, - model.ln_f_b); - ggml_set_name(inpL, "before"); - offload_func_nr(inpL); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_set_name(inpL, "last_layer"); -// offload_func(inpL); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); - - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); - -#ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if 
(node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } -#endif - - - - // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); - - //if (n_past%100 == 0) { - // ggml_graph_print (gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, "hello world"); - } - printf("load finish\n"); - - // keep this buffer alive while evaluating the model - - struct ggml_allocr * allocr = NULL; - // allocate the compute buffer - { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - // compute_buffer.resize(mem_size); - compute_buffer = ggml_cuda_host_malloc(mem_size); - // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); - allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); - - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // submit the input prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - 
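// The generation loop below calls gpt_sample_top_k_top_p() from common.h, whose
// definition is not part of this hunk. For reference only, a minimal standalone
// sketch of the usual top-k / top-p (nucleus) sampling scheme such a helper
// implements -- an assumption about its behaviour, not the actual common.cpp
// code; temp is assumed to be > 0.
#include <algorithm>
#include <cmath>
#include <random>
#include <utility>
#include <vector>

static int sample_top_k_top_p_sketch(const std::vector<float> & logits,
                                     int top_k, float top_p, float temp,
                                     std::mt19937 & rng) {
    const int n = (int) logits.size();
    if (top_k <= 0 || top_k > n) top_k = n;

    // pair each temperature-scaled logit with its token id and keep the k largest
    std::vector<std::pair<float, int>> cand(n);
    for (int i = 0; i < n; ++i) cand[i] = { logits[i] / temp, i };
    std::partial_sort(cand.begin(), cand.begin() + top_k, cand.end(),
                      [](const auto & a, const auto & b) { return a.first > b.first; });
    cand.resize(top_k);

    // softmax over the surviving candidates
    const float maxl = cand.front().first;
    std::vector<double> probs(top_k);
    double sum = 0.0;
    for (int i = 0; i < top_k; ++i) { probs[i] = std::exp(cand[i].first - maxl); sum += probs[i]; }
    for (auto & p : probs) p /= sum;

    // nucleus cut: keep the smallest prefix whose cumulative mass reaches top_p
    double cum = 0.0;
    size_t keep = probs.size();
    for (size_t i = 0; i < probs.size(); ++i) { cum += probs[i]; if (cum >= top_p) { keep = i + 1; break; } }
    probs.resize(keep);

    // draw one candidate proportionally to its (renormalized) probability
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return cand[dist(rng)].second; // token id
}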
std::vector embd; - - int cnt = 0; - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - cnt += 1; - - if (cnt > 0) - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - llama_sampling_params & sparams = params.sparams; - const int top_k = sparams.top_k; - const float top_p = sparams.top_p; - const float temp = sparams.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-2-sparse/main7b.cpp b/examples/gpt-2-sparse/main7b.cpp deleted file mode 100644 index a07a5472..00000000 --- a/examples/gpt-2-sparse/main7b.cpp +++ /dev/null @@ -1,1567 +0,0 @@ -#include "ggml.h" -#include "ggml-alloc.h" -#include - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-cuda.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif -typedef void (*offload_func_t)(struct ggml_tensor * tensor); -void opt_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; -} -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - // struct ggml_tensor * c_attn_attn_w; - // struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_attn_q_w; - struct ggml_tensor * c_attn_attn_q_b; - - struct ggml_tensor * c_attn_attn_k_w; - struct ggml_tensor * c_attn_attn_k_b; - - struct ggml_tensor * c_attn_attn_v_w; - struct ggml_tensor * c_attn_attn_v_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct 
ggml_tensor * c_mlp_proj_b; - - struct ggml_tensor * gpu_idx; - struct ggml_tensor * gpu_bucket; - // gpu heat - struct ggml_tensor * c_mlp_fc_w_gpu; - struct ggml_tensor * c_mlp_proj_w_t; - struct ggml_tensor * c_mlp_proj_w_gpu; - - //predictor - struct ggml_tensor * mlp_pre_w1_w; - struct ggml_tensor * mlp_pre_w2_w; -}; - -struct opt_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - opt_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error("opt_file fail\n"); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - ~opt_file() { - if (fp) { - std::fclose(fp); - } - } -}; -#define _POSIX_MAPPED_FILES -#include -#include - -struct opt_mmap { - void * addr; - size_t size; - - opt_mmap(const opt_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - opt_mmap(struct opt_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error("mmap failed\n"); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~opt_mmap() { - munmap(addr, size); - } -#else - static constexpr bool SUPPORTED = false; - - opt_mmap(struct opt_file *, bool prefetch = true, bool numa = false) { - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -struct gpt2_model { - gpt2_hparams hparams; - struct opt_file * file; - struct opt_mmap * mapping; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; -}; - -struct ggml_context * ctx0 = nullptr; -// std::vector compute_buffer; -void *compute_buffer; - -bool endsWith(const std::string& str, const std::string& suffix) { - if (str.length() < suffix.length()) { - return false; - } - return str.substr(str.length() - suffix.length()) == suffix; -} - - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & 
vocab, gpt_params model_params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - model.file = new opt_file(fname.c_str(), "rb"); - printf("size %d\n", model.file->size); - model.mapping = new opt_mmap(model.file, 0, false); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - /* int32_t n_vocab = 0; */ - /* fin.read((char *) &n_vocab, sizeof(n_vocab)); */ - - /* if (n_vocab != model.hparams.n_vocab) { */ - /* fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", */ - /* __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); */ - /* return false; */ - /* } */ - int32_t n_vocab = model.hparams.n_vocab; - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - printf("wtype %d\n", wtype); - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += 
n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - //need refactor - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_idx - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_I32)); // gpu_bucket - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); // c_mlp_fc_w_h20 - ctx_size += n_layer*(4096*n_embd*4*ggml_type_sizef(wtype)); - //predictor - ctx_size += n_layer*(4096*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*4*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - ctx_size += n_layer*(4096 * 4*1024*ggml_type_sizef(GGML_TYPE_F32)); // pre_w - ctx_size += n_layer*(4096*ggml_type_sizef(GGML_TYPE_F32)); // pre_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size = 0; - - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*51200; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - int main_gpu = 0; -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_GPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT -#else -#define OPT_BACKEND_OFFLOAD GGML_BACKEND_CPU -#define OPT_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU -#endif - - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // model.ln_f_g->backend = OPT_BACKEND_OFFLOAD; - // model.ln_f_b->backend = OPT_BACKEND_OFFLOAD; - - // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - // model.wpe = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ctx+2); - // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // model.lm_head->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["output_norm.weight"] = &model.ln_f_g; - model.tensors["output_norm.bias"] = &model.ln_f_b; - - model.tensors["tok_embeddings.weight"] = &model.wte; - model.tensors["pos_embeddings.weight"] = &model.wpe; - model.tensors["output.weight"] = &model.lm_head; - - for (int 
i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - memset(&layer, 0, sizeof(gpt2_layer)); - - // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - // // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - // layer.c_attn_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - // // need refine - // layer.gpu_idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_embd * 4); - // layer.gpu_bucket = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048*5); - // layer.c_mlp_fc_w_gpu = ggml_new_tensor_2d(ctx, wtype, n_embd, 2048*5); - - // layer.c_mlp_proj_w_t = ggml_new_tensor_2d(ctx, wtype, n_embd, 4* n_embd); - // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // layer.c_mlp_proj_w_gpu = ggml_new_tensor_2d(ctx, wtype,2048*5, n_embd); - - // if (i <= 10) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 192); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 192, 4*n_embd); - // } else if (i <= 12) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 288); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 288, 4*n_embd); - // } else if (i <= 18) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 512); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 512, 4*n_embd); - - // } else if (i <= 21) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 768); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 768, 4*n_embd); - // } else if (i <= 26) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1024); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1024, 4*n_embd); - // } else if (i <= 31) { - // layer.mlp_pre_w1_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 1280); - // layer.mlp_pre_w2_w = ggml_new_tensor_2d(ctx, wtype, 1280, 4*n_embd); - // } - - // layer.ln_1_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_1_b->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_g->backend = OPT_BACKEND_OFFLOAD; - // layer.ln_2_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_q_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_k_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_attn_v_b->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_attn_proj_b->backend = 
OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_b->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_fc_w->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_b->backend = OPT_BACKEND_OFFLOAD; - - // layer.mlp_pre_w1_w->backend = OPT_BACKEND_OFFLOAD; - // layer.mlp_pre_w2_w->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_fc_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.c_mlp_proj_w_gpu->backend = OPT_BACKEND_OFFLOAD; - // layer.gpu_bucket->backend = OPT_BACKEND_OFFLOAD; - // // layer.c_mlp_proj_w_t->backend = OPT_BACKEND_OFFLOAD; - - // map by name - model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = &layer.ln_1_g; - model.tensors["layers." + std::to_string(i) + ".attention_norm.bias"] = &layer.ln_1_b; - - model.tensors["layers." + std::to_string(i) + ".output_norm.weight"] = &layer.ln_2_g; - model.tensors["layers." + std::to_string(i) + ".output_norm.bias"] = &layer.ln_2_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = &layer.c_attn_attn_q_w; - model.tensors["layers." + std::to_string(i) + ".attention.wq.bias"] = &layer.c_attn_attn_q_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = &layer.c_attn_attn_k_w; - model.tensors["layers." + std::to_string(i) + ".attention.wk.bias"] = &layer.c_attn_attn_k_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = &layer.c_attn_attn_v_w; - model.tensors["layers." + std::to_string(i) + ".attention.wv.bias"] = &layer.c_attn_attn_v_b; - - model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = &layer.c_attn_proj_w; - model.tensors["layers." + std::to_string(i) + ".attention.wo.bias"] = &layer.c_attn_proj_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = &layer.c_mlp_fc_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.bias"] = &layer.c_mlp_fc_b; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = &layer.c_mlp_proj_w; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_transpose"] = &layer.c_mlp_proj_w_t; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.bias"] = &layer.c_mlp_proj_b; - - model.tensors["layers." + std::to_string(i) + ".gpu.weight"] = &layer.gpu_idx; - model.tensors["layers." + std::to_string(i) + ".gpu.bucket"] = &layer.gpu_bucket; - model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight_h20"] = &layer.c_mlp_fc_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight_h20"] = &layer.c_mlp_proj_w_gpu; - - model.tensors["layers." + std::to_string(i) + ".fc1.weight"] = &layer.mlp_pre_w1_w; - model.tensors["layers." 
+ std::to_string(i) + ".fc2.weight"] = &layer.mlp_pre_w2_w; - } - } - - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - #ifdef GGML_USE_CUBLAS - ggml_cuda_assign_buffers_no_scratch(model.memory_k); - ggml_cuda_assign_buffers_no_scratch(model.memory_v); - #endif - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - ggml_set_no_alloc(ctx, true); - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - const std::vector to_gpu = { - "output_norm.bias", - "output_norm.weight", - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - // ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - "layers.*output_norm.weight", - "layers.*output_norm.bias", - ".*fc1.weight", - ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_gpu_lv = { - ".*attention.wq.weight", - ".*attention.wq.bias", - ".*attention.wk.weight", - ".*attention.wk.bias", - ".*attention.wv.weight", - ".*attention.wv.bias", - ".*attention.wo.weight", - ".*attention.wo.weight_transpose", - ".*attention.wo.bias", - ".*feed_forward.w1.weight_h20", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_h20$", - // ".*feed_forward.w2.weight_transpose", - /* ".*feed_forward.w2.weight$", */ - ".*feed_forward.w2.bias", - ".*gpu.bucket", - ".*attention_norm.weight", - ".*attention_norm.bias", - // "layers.*output_norm.weight", - // "layers.*output_norm.bias", - // ".*fc1.weight", - // ".*fc2.weight", - // ".*attention.*fc1.weight", - // ".*attention.*fc1.bias", - // ".*attention.*fc2.weight", - // ".*attention.*fc2.bias", - - // "output.weight", - - // "model/h.*/attn/c_proj/w", - // "model/h.*/mlp/c_fc/w", - // "model/h.*/mlp/c_proj/w", - }; - const std::vector to_lock = { - "tok_embeddings.weight", - "pos_embeddings.weight", - // "output_norm.bias", - ".*attention.wq.weight", - ".*attention.wq.bias", - // ".*attention.wo.weight", - // ".*attention.wo.weight_transpose", - // ".*attention.wo.bias", - ".*feed_forward.w1.weight", - ".*feed_forward.w1.bias", - ".*feed_forward.w2.weight_transpose", - // ".*feed_forward.w2.weight", - ".*feed_forward.w2.bias", - ".*gpu.weight", - ".*attention_norm.weight", - ".*attention_norm.bias", - ".*output_norm.weight", - ".*output_norm.bias", - ".*attention.*fc1.weight", - ".*attention.*fc1.bias", - ".*attention.*fc2.weight", - ".*attention.*fc2.bias", - // ".*w2.bias", - // ".*w1.bias", - "output.weight", - }; - - while (true) { - int32_t n_dims; - int32_t length; - 
int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - int64_t new_ne[2]; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - new_ne[i] = ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - ggml_tensor ** ptr = model.tensors[name]; - // printf("name %s ptr %p\n", name.c_str(), *ptr); - // int k; - // scanf("%d", &k); - *ptr = ggml_new_tensor(ctx, ggml_type(ttype), n_dims, (const int64_t *)&new_ne); - - auto tensor = (ggml_tensor *)*model.tensors[name]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file elements %d\n", __func__, name.c_str(), nelements); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - std::streampos offset = fin.tellg(); - // fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - fin.seekg(ggml_nbytes(tensor), std::ios::cur); - tensor->data = model.mapping->addr + static_cast(offset); - // if ( endsWith(name.c_str(), "weight_transpose")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i+4096]); - // } - // } - // printf("\n"); - // if (endsWith(name.c_str(), "weight_h20")) { - // short *d = (short *)tensor->data; - // for (int i = 0; i < 10; i++) { - // printf("%d ", d[i]); - - // } - // int k; - // scanf("%d", &k); - // } - - // // GPT-2 models share the WTE tensor as the LM head - // if (name == "model/wte" && has_lm_head == false) { - // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); - // } - - // if (name == "model/lm_head") { - // has_lm_head = true; - // } - if (model_params.low_vram == false) { - for (const auto &s : to_gpu) - { - if (std::regex_search(name, std::regex(s))) - { - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - } else { - for (const auto &s : to_gpu_lv) - { - if (std::regex_search(name, std::regex(s))) - { - std::regex pattern(R"(\d+)"); - std::smatch match; - int layer_id = 0; - if (std::regex_search(name, match, pattern)) - { - std::string digitStr = match.str(); - int num = std::stoi(digitStr); - layer_id = num; - } - // printf("layerid %d, ngpu_layers %d\n", layer_id, model_params.n_gpu_layers); - if (layer_id > model_params.n_gpu_layers) - break; - // printf("name %s\n", name.c_str()); - tensor->backend = GGML_BACKEND_GPU; - break; - } - } - - } - if (tensor->backend == GGML_BACKEND_GPU) { 
- #if defined(GGML_USE_CUBLAS) - ggml_cuda_transform_tensor(tensor->data, tensor); - #endif - } - for (const auto &s : to_lock) - { - if (std::regex_match(name, std::regex(s))) - { - if(!mlock(tensor->data, ggml_nbytes(tensor))) { - // printf("mlock %s\n", name.c_str()); - } - else { - printf("mlock failed %s\n", name.c_str()); - } - } - } - - total_size += ggml_nbytes(tensor); - } - ggml_set_no_alloc(ctx, false); - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - - fin.close(); - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); - // static std::vector buf(buf_size); - static void * buf = ggml_cuda_host_malloc(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i + 2; - } - } - offload_func_t offload_func = opt_nop; - offload_func_t offload_func_kq = opt_nop; - offload_func_t offload_func_v = opt_nop; - offload_func_t offload_func_nr = opt_nop; - offload_func_t offload_debug = opt_nop; -#ifdef GGML_USE_CUBLAS - offload_debug = ggml_cuda_assign_buffers_no_alloc; - offload_func = ggml_cuda_assign_buffers_no_alloc; - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; -#endif - // offload_func_t offload_debug = ggml_cuda_assign_buffers_no_alloc; - // int k; - // scanf("%d", &k); - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(allocr, KQ_scale); - if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); - ggml_set_name(inpL, "inpL_first"); - // offload_func(inpL); - - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - offload_func(cur); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g); - offload_func(cur); - ggml_set_name(cur, "ln_1_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_1_b); - 
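// The three ops just above -- ggml_norm, ggml_mul by ln_1_g, ggml_add of ln_1_b --
// compose a standard LayerNorm with a learned scale and shift:
//     y = g * (x - mean(x)) / sqrt(var(x) + eps) + b
// A scalar reference sketch for one n_embd-sized row (illustrative only, not the
// ggml kernel):
#include <cmath>
#include <vector>

static std::vector<float> layer_norm_ref(const std::vector<float> & x,
                                         const std::vector<float> & g,
                                         const std::vector<float> & b,
                                         float eps) {
    const size_t n = x.size();
    float mean = 0.0f;
    for (float v : x) mean += v;
    mean /= n;
    float var = 0.0f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= n;
    std::vector<float> y(n);
    for (size_t i = 0; i < n; ++i) {
        y[i] = g[i] * (x[i] - mean) / std::sqrt(var + eps) + b[i];
    }
    return y;
}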
ggml_set_name(cur, "ln_1_b"); - // offload_func(cur); - - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - - struct ggml_tensor *k_cpy = nullptr; - struct ggml_tensor *v_cpy = nullptr; - // self-attention - { - // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_q_w,cur); - offload_func_kq(Qcur); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].c_attn_attn_q_b); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_k_w,cur); - offload_func_kq(Kcur); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].c_attn_attn_k_b); - offload_func_kq(Kcur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_v_w,cur); - offload_func_v(Vcur); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].c_attn_attn_v_b); - offload_func_v(Vcur); - - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - offload_func_v(Vcur); - - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - offload_func_kq(k); - // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd+ n_past*ggml_element_size(model.memory_v)); - - offload_func_v(v); - k_cpy = ggml_cpy(ctx0, Kcur, k); - offload_func_kq(k_cpy); - ggml_set_name(k_cpy, "k_cpy"); - v_cpy = ggml_cpy(ctx0, Vcur, v); - offload_func_v(v_cpy); - ggml_set_name(v_cpy, "v_cpy"); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N); - offload_func_kq(Qcur); - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - offload_func_kq(Q); - - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - // struct ggml_tensor * K = - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 0, 2, 1, 3); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, model.memory_k, - 128, n_past + N, n_head, - ggml_element_size(model.memory_k)*n_embd, - ggml_element_size(model.memory_k)*128, - ggml_element_size(model.memory_k)*n_embd*n_ctx*il); - K->src[1] = k_cpy; - offload_func_kq(K); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // 
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - KQ_scale); - offload_func_kq(KQ_scaled); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, 128, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*128, - n_ctx*ggml_element_size(model.memory_k)*n_embd*il); - V->src[1] = v_cpy; - offload_func_v(V); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merge_cont"); - offload_func_v(cur); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_proj"); - offload_func(cur); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_set_name(cur, "attn_bias"); - offload_func(cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - ggml_set_name(cur, "after attn"); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - ggml_tensor *idx = nullptr; - ggml_tensor *idx_g = nullptr; - ggml_tensor *cur_c = nullptr; - - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - offload_func(cur); - ggml_set_name(cur, "norm_FFN"); - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g); - offload_func(cur); - ggml_set_name(cur, "norm_FFN_g"); - cur = ggml_add(ctx0, - cur, - model.layers[il].ln_2_b); - // offload_func(cur); - // ggml_set_name(cur, "norm_FFN_w"); - // cur_c = ggml_dup(ctx0, cur); - } - // if (N == 1) - if (1) - { - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w1_w, - cur); - offload_func(idx); - ggml_set_name(idx, "mlp_pre_w1"); - idx = ggml_relu(ctx0, idx); - offload_func(idx); - ggml_set_name(idx, "relu_pre"); - idx = ggml_mul_mat(ctx0, - model.layers[il].mlp_pre_w2_w, - idx); - ggml_set_name(idx, "mlp_pre_w2"); - // offload_func(idx); - // idx = ggml_sigmoid(ctx0, idx); - // offload_func(idx); - // idx_g = idx; - // idx = ggml_dup(ctx0, idx_g); - // ggml_set_name(idx, "idx_cpu_dup"); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) 
- // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - if (N >= 80) - // if (0) - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - offload_func(cur); - ggml_set_name(cur, "up_ffn"); - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - offload_func(cur); - } - else - { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_fc_w, - // cur); - // offload_func(cur); - // cur = ggml_add(ctx0, - // cur, - // model.layers[il].c_mlp_fc_b); - // offload_func(cur); - - - struct ggml_tensor *tmp = ggml_mul_mat_special(ctx0, - model.layers[il].c_mlp_fc_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "mlp_up_gpu"); - offload_func(tmp); - offload_debug(tmp); - cur = ggml_mul_mat_idx(ctx0, - model.layers[il].c_mlp_fc_w, - cur, - idx, - model.layers[il].gpu_idx); - ggml_set_name(cur, "mlp_up_cpu"); - tmp = ggml_add_idx(ctx0, - tmp, - model.layers[il].c_mlp_fc_b, - idx); - ggml_set_name(tmp, "mlp_up_bias"); - offload_debug(tmp); - offload_func(tmp); - - cur = ggml_add(ctx0, cur, tmp); - ggml_set_name(cur, "mlp_up_mix"); - offload_func(cur); - - // cur = tmp; - - } - - - - // GELU activation - // [3072, N] - cur = ggml_relu(ctx0, cur); - // cur_c = cur; - // offload_func(cur); - cur_c = cur->backend==GGML_BACKEND_CPU? cur : ggml_dup(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - if (N >= 80) { - // if (0) { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_proj_w, - cur); - offload_func(cur); - ggml_set_name(cur, "down_ffn"); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - offload_func(cur); - } - else { - // cur = ggml_mul_mat(ctx0, - // model.layers[il].c_mlp_proj_w, - // cur); - // offload_func(cur); - - // cur = ggml_axpy(ctx0, - // model.layers[il].c_mlp_proj_w_t, - // cur, - // NULL, - // NULL); - // offload_func(cur); - - - // struct ggml_tensor *tmp = ggml_mul_mat_idx(ctx0, - // model.layers[il].c_mlp_proj_w_gpu, - // cur, - // model.layers[il].gpu_bucket, - // NULL); - struct ggml_tensor *tmp = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_gpu, - cur, - idx, - model.layers[il].gpu_bucket); - ggml_set_name(tmp, "axpy"); - offload_func(tmp); - offload_debug(tmp); - - cur = ggml_axpy(ctx0, - model.layers[il].c_mlp_proj_w_t, - cur_c, - idx, - model.layers[il].gpu_idx); - - cur = ggml_add(ctx0, cur, tmp); - offload_func(cur); - - cur = ggml_add(ctx0, cur, model.layers[il].c_mlp_proj_b); - offload_func(cur); - - // tmp = ggml_add(ctx0, - // tmp, - // model.layers[il].c_mlp_proj_b); - // offload_func(tmp); - // offload_debug(tmp); - - // cur = tmp; - } - - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - offload_func(inpL); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - offload_func_nr(inpL); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_mul(ctx0, - inpL, - model.ln_f_g); - offload_func_nr(inpL); - inpL = ggml_add(ctx0, - inpL, - model.ln_f_b); - ggml_set_name(inpL, "before"); - offload_func_nr(inpL); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_set_name(inpL, "last_layer"); -// offload_func(inpL); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate 
the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - struct ggml_allocr * allocr, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); - - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); - -#ifdef GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - // ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer.data()); - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) compute_buffer); - } - } -#endif - - - - // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); - - //if (n_past%100 == 0) { - // ggml_graph_print (gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, "hello world"); - } - printf("load finish\n"); - - // keep this buffer alive while evaluating the model - - struct ggml_allocr * allocr = NULL; - // allocate the compute buffer - { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = 
gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - // compute_buffer.resize(mem_size); - compute_buffer = ggml_cuda_host_malloc(mem_size); - // allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); - allocr = ggml_allocr_new(compute_buffer, mem_size, GGML_MEM_ALIGN); - - fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // submit the input prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - std::vector embd; - - int cnt = 0; - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - cnt += 1; - - if (cnt > 0) - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - llama_sampling_params & sparams = params.sparams; - const int top_k = sparams.top_k; - const float top_p = sparams.top_p; - const float temp = sparams.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/(cnt)); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gpt-2-sparse/quantize.cpp b/examples/gpt-2-sparse/quantize.cpp deleted file mode 100644 index f81c04e8..00000000 --- a/examples/gpt-2-sparse/quantize.cpp +++ /dev/null @@ -1,184 +0,0 @@ -#include "ggml.h" - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include 
-#include -#include -#include -#include -#include -#include - -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; -}; - -// quantize a model -bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { - gpt_vocab vocab; - - printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); - - auto finp = std::ifstream(fname_inp, std::ios::binary); - if (!finp) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); - return false; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - finp.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); - return false; - } - - fout.write((char *) &magic, sizeof(magic)); - } - - gpt2_hparams hparams; - - // load hparams - { - finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; - const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); - printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); - printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); - printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); - - fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); - fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fout.write((char *) &ftype_dst, sizeof(ftype_dst)); - } - - // load vocab - { - int32_t n_vocab = 0; - finp.read ((char *) &n_vocab, sizeof(n_vocab)); - fout.write((char *) &n_vocab, sizeof(n_vocab)); - - if (n_vocab != hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); - return false; - } - - std::string word; - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - finp.read ((char *) &len, sizeof(len)); - fout.write((char *) &len, sizeof(len)); - - word.resize(len); - finp.read ((char *) word.data(), len); - fout.write((char *) word.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // regexes of tensor names to be quantized - const std::vector to_quant = { - "model/wte", - "model/lm_head", - "model/h.*/attn/c_attn/w", - "model/h.*/attn/c_proj/w", - "model/h.*/mlp/c_fc/w", - "model/h.*/mlp/c_proj/w", - }; - - if 
(!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) { - fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); - return false; - } - - finp.close(); - fout.close(); - - return true; -} - -// usage: -// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type -// -int main(int argc, char ** argv) { - if (argc != 4) { - fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); - ggml_print_ftypes(stderr); - return 1; - } - - // needed to initialize f16 tables - { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } - - const std::string fname_inp = argv[1]; - const std::string fname_out = argv[2]; - - const ggml_ftype ftype = ggml_parse_ftype(argv[3]); - - const int64_t t_main_start_us = ggml_time_us(); - - int64_t t_quantize_us = 0; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) { - fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); - return 1; - } - - t_quantize_us = ggml_time_us() - t_start_us; - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n"); - printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - return 0; -} diff --git a/llama.cpp b/llama.cpp index 3ae9e946..2a9ea030 100644 --- a/llama.cpp +++ b/llama.cpp @@ -230,6 +230,7 @@ enum llm_arch { LLM_ARCH_GPT2, LLM_ARCH_GPTJ, LLM_ARCH_GPTNEOX, + LLM_ARCH_OPT, LLM_ARCH_MPT, LLM_ARCH_STARCODER, LLM_ARCH_PERSIMMON, @@ -246,6 +247,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_GPT2, "gpt2" }, { LLM_ARCH_GPTJ, "gptj" }, { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_OPT, "opt" }, { LLM_ARCH_MPT, "mpt" }, { LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_STARCODER, "starcoder" }, @@ -483,6 +485,23 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_OPT, + { + {LLM_TENSOR_TOKEN_EMBD, "token_embd"}, + {LLM_TENSOR_POS_EMBD, "position_embd"}, + {LLM_TENSOR_OUTPUT_NORM, "output_norm"}, + {LLM_TENSOR_OUTPUT, "output"}, + {LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"}, + {LLM_TENSOR_ATTN_Q, "blk.%d.attn_q"}, + {LLM_TENSOR_ATTN_K, "blk.%d.attn_k"}, + {LLM_TENSOR_ATTN_V, "blk.%d.attn_v"}, + {LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"}, + {LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"}, + {LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"}, + {LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"}, + }, + }, { LLM_ARCH_PERSIMMON, { @@ -1321,6 +1340,9 @@ struct llama_layer { struct ggml_tensor * wqkv; // attention bias + struct ggml_tensor * bq; + struct ggml_tensor * bk; + struct ggml_tensor * bv; struct ggml_tensor * bo; struct ggml_tensor * bqkv; @@ -2341,6 +2363,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_OPT: + { + // TODO: GGUF_GET_KEY & support different model versions + hparams.n_ctx_train = 2050; // TODO: hard coded for now + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 48: model.type = e_model::MODEL_30B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_FALCON: { GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); @@ -3229,6 +3262,10 
@@ static void llm_load_sparse_model_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_OPT: + { + // TODO: load sparse tensor model + } break; case LLM_ARCH_FALCON: { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -3482,6 +3519,81 @@ static void llm_load_tensors( } } } break; + case LLM_ARCH_OPT: + { + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = llama_backend_offload; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload; +#endif // _WIN32 + + backend_output = llama_backend_offload_split; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + // model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, backend_output); // same as token_embed + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + } + // if (backend_output == GGML_BACKEND_GPU_SPLIT) { + // vram_weights += ggml_nbytes(model.output); + // } + } + const uint32_t n_ff = hparams.n_ff; + const int i_gpu_start = n_layer - n_gpu_layers; + model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend_split); + + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend_split); + + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend_split); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + } + } + } break; case LLM_ARCH_BAICHUAN: { model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); @@ -4928,6 +5040,103 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_opt() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + struct ggml_tensor * cur; + struct ggml_tensor * pos; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, 
model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + std::tie(k_cpy, v_cpy) = llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + } + // add input residual + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_RELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + // input for next layer + inpL = cur; + } + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; + } + struct ggml_cgraph * build_baichuan() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -6440,6 +6649,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_stablelm(); } break; + case LLM_ARCH_OPT: + { + result = llm.build_opt(); + } break; default: GGML_ASSERT(false); } From bc6e190bc1136c026da530eff784e726741e68ce Mon Sep 17 00:00:00 2001 From: "a.r.l" Date: Thu, 23 Jan 2025 21:16:20 +0800 Subject: [PATCH 3/4] feat: add sparse inference of opt --- convert-hf-to-powerinfer-gguf.py | 60 +++++++++++++++++++++++++ gguf-py/gguf/constants.py | 17 ++++++- gguf-py/gguf/tensor_mapping.py | 11 +++++ llama.cpp | 76 +++++++++++++++++++++++++++++--- 4 files changed, 158 insertions(+), 6 deletions(-) diff --git a/convert-hf-to-powerinfer-gguf.py b/convert-hf-to-powerinfer-gguf.py index 181fe972..28d77bdd 100644 --- a/convert-hf-to-powerinfer-gguf.py +++ b/convert-hf-to-powerinfer-gguf.py @@ -185,6 +185,8 @@ def from_model_architecture(model_architecture): return FalconModel if model_architecture == "LlamaForCausalLM": return LlamaModel + if model_architecture == "OPTForCausalLM": + return OptModel raise NotImplementedError(f'Architecture "{model_architecture}" not supported!') @@ -218,6 +220,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH: return gguf.MODEL_ARCH.FALCON if arch == "RWForCausalLM" or arch == "LlamaForCausalLM": return gguf.MODEL_ARCH.LLAMA + if arch == "OPTForCausalLM": + return gguf.MODEL_ARCH.OPT raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -513,7 +517,63 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) +class OptModel(Model): + def set_gguf_parameters(self, params: PredictorParams): + self.gguf_writer.add_name("opt") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + 
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_feed_forward_length(self.hparams["ffn_dim"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + # self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_file_type(self.ftype) + + if params.sparse_threshold is not None: + self.gguf_writer.add_sparse_threshold(params.sparse_threshold) + + def write_tensors(self): + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = self._translate_tensor_key(name) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + # We need to transpose the weight matrices for the FFN Down layers to support the + # Axpy operation in PowerInfer. So we don't need to transpose them at runtime. + if "ffn_down" in new_name: + new_name = new_name.replace("ffn_down", "ffn_down_t") + data = data.T + + n_dims = len(data.shape) + data_dtype = data.dtype + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) @dataclass class PredictorParams: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e82df27b..9459b477 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -90,6 +90,7 @@ class MODEL_ARCH(IntEnum): GPT2 = auto() GPTJ = auto() GPTNEOX = auto() + OPT = auto() MPT = auto() STARCODER = auto() PERSIMMON = auto() @@ -135,6 +136,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GPT2: "gpt2", MODEL_ARCH.GPTJ: "gptj", MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.OPT: "opt", MODEL_ARCH.MPT: "mpt", MODEL_ARCH.STARCODER: "starcoder", MODEL_ARCH.PERSIMMON: "persimmon", @@ -356,7 +358,20 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GPT2: [ # TODO ], - # TODO + MODEL_ARCH.OPT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], } # tensors that will not be serialized diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 2c813050..641b81f0 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -11,6 +11,7 @@ class TensorNameMap: MODEL_TENSOR.TOKEN_EMBD: ( "gpt_neox.embed_in", # gptneox "transformer.wte", # gpt2 gpt-j mpt refact + "decoder.embed_tokens", # opt "transformer.word_embeddings", # falcon "word_embeddings", # bloom "model.embed_tokens", # llama-hf @@ -33,6 +34,7 @@ class TensorNameMap: MODEL_TENSOR.POS_EMBD: ( "transformer.wpe", # gpt2 "embeddings.position_embeddings", # bert + "decoder.embed_positions", # opt ), # Output 
@@ -47,6 +49,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon + "decoder.final_layer_norm", # opt "model.norm", # llama-hf baichuan "norm", # llama-pth "embeddings.LayerNorm", # bert @@ -66,6 +69,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_NORM: ( "gpt_neox.layers.{bid}.input_layernorm", # gptneox "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact + "decoder.layers.{bid}.self_attn_layer_norm", # opt "transformer.blocks.{bid}.norm_1", # mpt "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom @@ -98,6 +102,7 @@ class TensorNameMap: "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j + "decoder.layers.{bid}.self_attn.q_proj", # opt ), # Attention key @@ -106,6 +111,7 @@ class TensorNameMap: "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j + "decoder.layers.{bid}.self_attn.k_proj", # opt ), # Attention value @@ -114,12 +120,14 @@ class TensorNameMap: "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j + "decoder.layers.{bid}.self_attn.v_proj", # opt ), # Attention output MODEL_TENSOR.ATTN_OUT: ( "gpt_neox.layers.{bid}.attention.dense", # gptneox "transformer.h.{bid}.attn.c_proj", # gpt2 refact + "decoder.layers.{bid}.self_attn.out_proj", # opt "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom @@ -140,6 +148,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_NORM: ( "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox "transformer.h.{bid}.ln_2", # gpt2 refact + "decoder.layers.{bid}.final_layer_norm", # opt "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt "model.layers.{bid}.post_attention_layernorm", # llama-hf @@ -153,6 +162,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_UP: ( "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox "transformer.h.{bid}.mlp.c_fc", # gpt2 + "decoder.layers.{bid}.fc1", # opt "transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "h.{bid}.mlp.dense_h_to_4h", # bloom @@ -173,6 +183,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox "transformer.h.{bid}.mlp.c_proj", # gpt2 refact + "decoder.layers.{bid}.fc2", # opt "transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "h.{bid}.mlp.dense_4h_to_h", # bloom diff --git a/llama.cpp b/llama.cpp index 2a9ea030..aad7f9c4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -247,7 +247,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_GPT2, "gpt2" }, { LLM_ARCH_GPTJ, "gptj" }, { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_OPT, "opt" }, + { LLM_ARCH_OPT, "opt" }, { LLM_ARCH_MPT, "mpt" }, { LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_STARCODER, "starcoder" }, @@ -499,7 +499,10 @@ static std::map> LLM_TENSOR_NAMES = {LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"}, {LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"}, {LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"}, + {LLM_TENSOR_FFN_DOWN_T, "blk.%d.ffn_down_t"}, {LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"}, + { LLM_TENSOR_MLP_PRED_FC1, "blk.%d.fc1" }, + { LLM_TENSOR_MLP_PRED_FC2, "blk.%d.fc2" }, }, }, { @@ -3264,7 +3267,47 @@ static void llm_load_sparse_model_tensors( } break; case LLM_ARCH_OPT: { - // TODO: 
load sparse tensor model + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); + // output + { + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + // model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + } + + const uint32_t n_ff = hparams.n_ff; + model.layers.resize(n_layer); + + for (uint32_t &i = current_layer; i < n_layer; ++i) { + auto & layer = model.layers[i]; + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down_t = create_tensor(tn(LLM_TENSOR_FFN_DOWN_T, "weight", i), {n_embd, n_ff}); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_T, "bias", i), {n_embd}); + + layer.mlp_pre_w1 = create_tensor(tn(LLM_TENSOR_MLP_PRED_FC1, "weight", i), {n_embd, GGML_NE_WILDCARD}); + layer.mlp_pre_w2 = create_tensor(tn(LLM_TENSOR_MLP_PRED_FC2, "weight", i), {GGML_NE_WILDCARD, n_ff}); + + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + } } break; case LLM_ARCH_FALCON: { @@ -5110,14 +5153,37 @@ struct llm_build_context { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + if(llama_use_sparse_inference(&model)) { + llm_build_cb_short cbs = [&](ggml_tensor * cur, const char * name) { + std::string name_str = std::string(name) + "-" + std::to_string(il); + ggml_set_name(cur, name_str.c_str()); + }; + // We only offload the ffn input to GPU if all neurons are offloaded + if (model.layers[il].gpu_offload_ratio >= 1.) 
{ + cb(cur, "ffn_norm", il); + } else { + cbs(cur, "ffn_norm"); + } + cur = llm_build_ffn_sparse(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down_t, model.layers[il].ffn_down_b, + model.layers[il].mlp_pre_w1, + model.layers[il].mlp_pre_w2, + ffn_inp, + model.layers[il].gpu_idx, + model.layers[il].gpu_bucket, model.layers[il].ffn_gate_gpu, model.layers[il].ffn_down_gpu, model.layers[il].ffn_up_gpu, + LLM_FFN_RELU, LLM_FFN_SEQ, model.layers[il].gpu_offload_ratio, cbs); + } else { + cb(cur, "ffn_norm", il); + cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, LLM_FFN_RELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); + cb(cur, "ffn_out", il); + } } cur = ggml_add(ctx0, cur, ffn_inp); From 7066d20aa179dd15e9c5c18c4af13719f3e0f81c Mon Sep 17 00:00:00 2001 From: "a.r.l" Date: Tue, 18 Feb 2025 21:12:29 +0800 Subject: [PATCH 4/4] feat: fix sparse problems --- common/common-ggml.cpp | 253 ----------------------- common/common-ggml.h | 18 -- convert-hf-to-powerinfer-gguf.py | 2 +- llama.cpp | 3 + powerinfer-py/powerinfer/export_split.py | 5 +- 5 files changed, 8 insertions(+), 273 deletions(-) delete mode 100644 common/common-ggml.cpp delete mode 100644 common/common-ggml.h diff --git a/common/common-ggml.cpp b/common/common-ggml.cpp deleted file mode 100644 index 794607c6..00000000 --- a/common/common-ggml.cpp +++ /dev/null @@ -1,253 +0,0 @@ -#include "common-ggml.h" - -#include -#include - -static const std::map GGML_FTYPE_MAP = { - {"q4_0", GGML_FTYPE_MOSTLY_Q4_0}, - {"q4_1", GGML_FTYPE_MOSTLY_Q4_1}, - {"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, - {"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, - {"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, -}; - -void ggml_print_ftypes(FILE * fp) { - for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) { - fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second); - } -} - -enum ggml_ftype ggml_parse_ftype(const char * str) { - enum ggml_ftype ftype; - if (str[0] == 'q') { - const auto it = GGML_FTYPE_MAP.find(str); - if (it == GGML_FTYPE_MAP.end()) { - fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str); - return GGML_FTYPE_UNKNOWN; - } - ftype = it->second; - } else { - ftype = (enum ggml_ftype) atoi(str); - } - - return ftype; -} - -bool ggml_common_quantize_0( - std::ifstream & finp, - std::ofstream & fout, - const ggml_ftype ftype, - const std::vector & to_quant, - const std::vector & to_skip) { - - ggml_type qtype = GGML_TYPE_F32; - - switch (ftype) { - case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; - case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; - case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; - case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; - case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; - case GGML_FTYPE_UNKNOWN: - case GGML_FTYPE_ALL_F32: - case GGML_FTYPE_MOSTLY_F16: - case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: - case GGML_FTYPE_MOSTLY_Q2_K: - case GGML_FTYPE_MOSTLY_Q3_K: - case GGML_FTYPE_MOSTLY_Q4_K: - case GGML_FTYPE_MOSTLY_Q5_K: - case GGML_FTYPE_MOSTLY_Q6_K: - { - fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); - return false; - } - }; - - if (!ggml_is_quantized(qtype)) { - fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype)); - return false; - } - - size_t total_size_org = 0; - size_t total_size_new = 0; - - std::vector work; - - std::vector data_u8; - 
std::vector data_f16; - std::vector data_f32; - - std::vector hist_all(1 << 4, 0); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - finp.read(reinterpret_cast(&length), sizeof(length)); - finp.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (finp.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[4] = { 1, 1, 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - finp.read (&name[0], length); - - printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); - - bool quantize = false; - - // check if we should quantize this tensor - for (const auto & s : to_quant) { - if (std::regex_match(name, std::regex(s))) { - quantize = true; - break; - } - } - - // check if we should skip this tensor - for (const auto & s : to_skip) { - if (std::regex_match(name, std::regex(s))) { - quantize = false; - break; - } - } - - // quantize only 2D tensors - quantize &= (n_dims == 2); - - if (quantize) { - if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) { - fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); - return false; - } - - if (ttype == GGML_TYPE_F16) { - data_f16.resize(nelements); - finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); - data_f32.resize(nelements); - for (int i = 0; i < nelements; ++i) { - data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); - } - } else { - data_f32.resize(nelements); - finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); - } - - ttype = qtype; - } else { - // const int bpe = (ttype == 0) ? 
sizeof(float) : sizeof(uint16_t); - int bpe = -1; - if (ttype == 0 || ttype == 18) { - bpe = sizeof(float); - } - else { - bpe = sizeof(uint16_t); - } - - data_u8.resize(nelements*bpe); - finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); - } - - fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); - fout.write(reinterpret_cast(&length), sizeof(length)); - fout.write(reinterpret_cast(&ttype), sizeof(ttype)); - for (int i = 0; i < n_dims; ++i) { - fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); - } - fout.write(&name[0], length); - - if (quantize) { - work.resize(nelements); // for quantization - - size_t cur_size = 0; - std::vector hist_cur(1 << 4, 0); - - switch ((ggml_type) ttype) { - case GGML_TYPE_Q4_0: - { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: - { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_0: - { - cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_1: - { - cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q8_0: - { - cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_Q8_K: - case GGML_TYPE_COUNT: - { - fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); - return false; - } - } - - fout.write(reinterpret_cast(work.data()), cur_size); - total_size_new += cur_size; - - printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < (int) hist_cur.size(); ++i) { - hist_all[i] += hist_cur[i]; - } - - for (int i = 0; i < (int) hist_cur.size(); ++i) { - printf("%5.3f ", hist_cur[i] / (float)nelements); - } - printf("\n"); - } else { - printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); - fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); - total_size_new += data_u8.size(); - } - - total_size_org += nelements * sizeof(float); - } - - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); - - { - int64_t sum_all = 0; - for (int i = 0; i < (int) hist_all.size(); ++i) { - sum_all += hist_all[i]; - } - - printf("%s: hist: ", __func__); - for (int i = 0; i < (int) hist_all.size(); ++i) { - printf("%5.3f ", hist_all[i] / (float)sum_all); - } - printf("\n"); - } - - return true; -} diff --git a/common/common-ggml.h b/common/common-ggml.h deleted file mode 100644 index 29ba4ad5..00000000 --- a/common/common-ggml.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include "ggml.h" - -#include -#include -#include - -enum ggml_ftype ggml_parse_ftype(const char * str); - -void ggml_print_ftypes(FILE * fp = stderr); - -bool ggml_common_quantize_0( - std::ifstream & finp, - std::ofstream & fout, - const ggml_ftype ftype, - const std::vector & to_quant, - const std::vector & to_skip); \ No newline at end of file diff --git 
a/convert-hf-to-powerinfer-gguf.py b/convert-hf-to-powerinfer-gguf.py
index 28d77bdd..0aa4632e 100644
--- a/convert-hf-to-powerinfer-gguf.py
+++ b/convert-hf-to-powerinfer-gguf.py
@@ -520,7 +520,7 @@ def write_tensors(self):
 class OptModel(Model):
     def set_gguf_parameters(self, params: PredictorParams):
         self.gguf_writer.add_name("opt")
-        self.gguf_writer.add_context_length(2048)  # not in config.json
+        self.gguf_writer.add_context_length(2050)  # not in config.json
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
         self.gguf_writer.add_feed_forward_length(self.hparams["ffn_dim"])
diff --git a/llama.cpp b/llama.cpp
index aad7f9c4..ac52908a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6483,6 +6483,9 @@ static struct ggml_cgraph * llama_build_graph(
 
         for (int i = 0; i < n_tokens; ++i) {
             data[i] = batch.pos[i];
+            if(model.arch == LLM_ARCH_OPT) {
+                data[i] += 2;
+            }
         }
     }
 
diff --git a/powerinfer-py/powerinfer/export_split.py b/powerinfer-py/powerinfer/export_split.py
index 9a773b26..7f230d8c 100644
--- a/powerinfer-py/powerinfer/export_split.py
+++ b/powerinfer-py/powerinfer/export_split.py
@@ -1,11 +1,14 @@
 import argparse
 import pickle
-import gguf
+import sys
 from gguf.constants import GGMLQuantizationType
 from gguf.gguf_writer import GGUFWriter
 import torch
 from pathlib import Path
 import os
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
 import struct
 import numpy as np
 import re
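
Note on the position handling: the "data[i] += 2" hunk above and the bump of the converter's context length from 2048 to 2050 go together. OPT's learned positional embeddings carry an offset of 2, so position ids are shifted by 2 before indexing a table that holds max_position_embeddings + 2 rows (2048 + 2 = 2050 here), which is also why the gpt-2-sparse example writes n_past + i + 2 into its position tensor. A minimal NumPy sketch of that lookup, with illustrative sizes only (this is not the converter's or llama.cpp's code):

    import numpy as np

    # Illustrative sizes: OPT keeps max_position_embeddings + 2 rows of
    # learned positional embeddings (2048 + 2 = 2050 for the models here).
    n_ctx_train = 2050
    n_embd = 512
    wpe = np.zeros((n_ctx_train, n_embd), dtype=np.float32)

    def opt_position_embeddings(n_past, n_tokens):
        # absolute positions of the new tokens, plus the OPT-specific offset of 2
        # (the same shift the llama.cpp hunk applies to batch.pos)
        pos = np.arange(n_past, n_past + n_tokens) + 2
        return wpe[pos]  # shape [n_tokens, n_embd]

    # the first 4 prompt tokens read rows 2..5 of the table
    print(opt_position_embeddings(0, 4).shape)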
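
Note on the sparse FFN path: llm_build_ffn_sparse wires in the predictor tensors (mlp_pre_w1/mlp_pre_w2, exported from blk.N.fc1/fc2) and the transposed down projection ffn_down_t, so the down step can be accumulated axpy-style over only the neurons the predictor marks as active; that is why the converter transposes ffn_down at export time rather than at runtime. The sketch below is a rough NumPy rendering of that dataflow with assumed shapes and a made-up activation threshold; it is not the ggml kernels, just the idea:

    import numpy as np

    def sparse_ffn(x, w_up, b_up, w_down_t, b_down, pre_w1, pre_w2, threshold=0.0):
        # x: [n_embd], w_up: [n_ff, n_embd], w_down_t: [n_embd, n_ff]
        # pre_w1: [r, n_embd], pre_w2: [n_ff, r]  (low-rank activation predictor)

        # 1. predict which FFN neurons are likely to fire
        scores = pre_w2 @ np.maximum(pre_w1 @ x, 0.0)          # [n_ff]
        active = np.flatnonzero(scores > threshold)

        # 2. evaluate only the selected rows of the up projection (ReLU FFN, as in OPT)
        h = np.maximum(w_up[active] @ x + b_up[active], 0.0)   # [n_active]

        # 3. accumulate the output column by column (the axpy step); storing the
        #    down projection transposed keeps each neuron's column contiguous
        y = b_down.astype(np.float32).copy()
        for j, neuron in enumerate(active):
            y += w_down_t[:, neuron] * h[j]
        return y

Neurons the predictor skips contribute nothing to y, which is what makes the split between GPU-resident and CPU-resident neuron buckets in the patch worthwhile.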