diff --git a/.gitignore b/.gitignore
index f2cb1a2..e4454a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -172,6 +172,8 @@ cython_debug/
 
 # data
 *.csv
+!example/rlhf/mlflow/input_rw/ranking.csv
+!example/rlhf/ranking.csv
 rlhf_tests
 test_data
 example/flex/docs
@@ -179,6 +181,12 @@ example/flex/flex.ipynb
 example/flex/flex_sample/
 example/flex/vectordb/
 
+# mlflow
+example/rlhf/mlflow/mlruns/
+example/rlhf/mlruns/
+mlruns/
+ppo_log_stats.json
+
 # mac
 .DS_Store
 
@@ -193,4 +201,4 @@ models/
 temp/
 
 # poetry
-poetry.lock
\ No newline at end of file
+poetry.lock
diff --git a/example/rlhf/demo_reward.ipynb b/example/rlhf/demo_reward.ipynb
index d84b90f..dfe3c3e 100644
--- a/example/rlhf/demo_reward.ipynb
+++ b/example/rlhf/demo_reward.ipynb
@@ -4,15 +4,7 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/home/ubuntu/git/pykoi/example\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# %reload_ext autoreload\n",
     "# %autoreload 2\n",
@@ -34,23 +26,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/opt/conda/envs/pykoi/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
       "  from .autonotebook import tqdm as notebook_tqdm\n"
      ]
     }
    ],
    "source": [
     "import pandas as pd\n",
-    "\n",
     "from pykoi.rlhf import RLHFConfig\n",
-    "from pykoi.rlhf import SupervisedFinetuning"
+    "from pykoi.rlhf import RewardFinetuning\n",
+    "from pykoi.chat import RankingDatabase"
    ]
   },
   {
@@ -62,10 +54,188 @@
     "Let's take a look of the QA data and process it for training."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>question</th>\n",
+       "      <th>up_ranking_answer</th>\n",
+       "      <th>low_ranking_answer</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>who are u?</td>\n",
+       "      <td>Unemployed</td>\n",
+       "      <td>I'm a girl</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>what is the weather today?</td>\n",
+       "      <td>it is rainy today</td>\n",
+       "      <td>The weather today is sunny with a high of 75 ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>who are u?</td>\n",
+       "      <td>Unemployed</td>\n",
+       "      <td>I'm a girl</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>what is the weather today?</td>\n",
+       "      <td>it is rainy today</td>\n",
+       "      <td>The weather today is sunny with a high of 75 ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>how are u?</td>\n",
+       "      <td>i am fine</td>\n",
+       "      <td>I'm fine</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                      question   up_ranking_answer  \\\n",
+       "id                                                   \n",
+       "1                   who are u?          Unemployed   \n",
+       "2   what is the weather today?   it is rainy today   \n",
+       "3                   who are u?          Unemployed   \n",
+       "4   what is the weather today?   it is rainy today   \n",
+       "5                   how are u?           i am fine   \n",
+       "\n",
+       "                                   low_ranking_answer  \n",
+       "id                                                     \n",
+       "1                                          I'm a girl  \n",
+       "2    The weather today is sunny with a high of 75 ...  \n",
+       "3                                          I'm a girl  \n",
+       "4    The weather today is sunny with a high of 75 ...  \n",
+       "5                                            I'm fine  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "stack_exchange_df = pd.read_csv(\"ranking.csv\", index_col=0)\n",
+    "stack_exchange_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "My ranking database has 30 samples\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['question', 'up_ranking_answer', 'low_ranking_answer'],\n",
+       "    num_rows: 30\n",
+       "})"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datasets import Dataset\n",
+    "\n",
+    "print(\"My ranking database has {} samples\".format(stack_exchange_df.shape[0]))\n",
+    "dataset = Dataset.from_dict(stack_exchange_df)\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "RANKING_CSV_HEADER_ID = 'id'\n",
+    "RANKING_CSV_HEADER_QUESTION = 'question'\n",
+    "RANKING_CSV_HEADER_UP_RANKING_ANSWER = 'up_ranking_answer'\n",
+    "RANKING_CSV_HEADER_LOW_RANKING_ANSWER = 'low_ranking_answer'\n",
+    "RANKING_CSV_HEADER = (\n",
+    "    RANKING_CSV_HEADER_ID,\n",
+    "    RANKING_CSV_HEADER_QUESTION,\n",
+    "    RANKING_CSV_HEADER_UP_RANKING_ANSWER,\n",
+    "    RANKING_CSV_HEADER_LOW_RANKING_ANSWER\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<pykoi.chat.db.ranking_database.RankingDatabase at 0x7fb8d7e090f0>"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ranking_database = RankingDatabase()\n",
+    "for row in stack_exchange_df.iloc[:].to_dict('records'):\n",
+    "    ranking_id = ranking_database.insert_ranking(question=row[RANKING_CSV_HEADER_QUESTION],\n",
+    "                                       up_ranking_answer=row[RANKING_CSV_HEADER_UP_RANKING_ANSWER],\n",
+    "                                       low_ranking_answer=row[RANKING_CSV_HEADER_LOW_RANKING_ANSWER]\n",
+    "                                       )\n",
+    "ranking_database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -90,87 +260,74 @@
        "      <th></th>\n",
        "      <th>ID</th>\n",
        "      <th>Question</th>\n",
-       "      <th>Answer</th>\n",
-       "      <th>Vote Status</th>\n",
-       "      <th>Timestamp</th>\n",
+       "      <th>Up Ranking Answer</th>\n",
+       "      <th>Low Ranking Answer</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>1</td>\n",
-       "      <td>who founded YC and what does YC do</td>\n",
-       "      <td>Y Combinator is a startup accelerator that pro...</td>\n",
-       "      <td>down</td>\n",
-       "      <td>2023-07-24 08:28:29.461045</td>\n",
+       "      <td>who are u?</td>\n",
+       "      <td>Unemployed</td>\n",
+       "      <td>I'm a girl</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>2</td>\n",
-       "      <td>what are the top YC companies</td>\n",
-       "      <td>I am looking for a list of the top YC companie...</td>\n",
-       "      <td>down</td>\n",
-       "      <td>2023-07-24 08:29:17.521467</td>\n",
+       "      <td>what is the weather today?</td>\n",
+       "      <td>it is rainy today</td>\n",
+       "      <td>The weather today is sunny with a high of 75 ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>3</td>\n",
-       "      <td>How much does top 10 YC companies worth</td>\n",
-       "      <td>According to the latest Crunchbase data, the 1...</td>\n",
-       "      <td>down</td>\n",
-       "      <td>2023-07-24 08:29:58.248998</td>\n",
+       "      <td>who are u?</td>\n",
+       "      <td>Unemployed</td>\n",
+       "      <td>I'm a girl</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>4</td>\n",
-       "      <td>What's the meaning of life</td>\n",
-       "      <td>The meaning of life is to be happy, to be usef...</td>\n",
-       "      <td>down</td>\n",
-       "      <td>2023-07-24 19:26:04.713297</td>\n",
+       "      <td>what is the weather today?</td>\n",
+       "      <td>it is rainy today</td>\n",
+       "      <td>The weather today is sunny with a high of 75 ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>5</td>\n",
-       "      <td>What is the meaning of life</td>\n",
-       "      <td>The meaning of life is to find your gift. The ...</td>\n",
-       "      <td>up</td>\n",
-       "      <td>2023-07-24 19:26:28.676427</td>\n",
+       "      <td>how are u?</td>\n",
+       "      <td>i am fine</td>\n",
+       "      <td>I'm fine</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "   ID                                 Question  \\\n",
-       "0   1       who founded YC and what does YC do   \n",
-       "1   2            what are the top YC companies   \n",
-       "2   3  How much does top 10 YC companies worth   \n",
-       "3   4               What's the meaning of life   \n",
-       "4   5              What is the meaning of life   \n",
-       "\n",
-       "                                              Answer Vote Status  \\\n",
-       "0  Y Combinator is a startup accelerator that pro...        down   \n",
-       "1  I am looking for a list of the top YC companie...        down   \n",
-       "2  According to the latest Crunchbase data, the 1...        down   \n",
-       "3  The meaning of life is to be happy, to be usef...        down   \n",
-       "4  The meaning of life is to find your gift. The ...          up   \n",
+       "   ID                    Question   Up Ranking Answer  \\\n",
+       "0   1                  who are u?          Unemployed   \n",
+       "1   2  what is the weather today?   it is rainy today   \n",
+       "2   3                  who are u?          Unemployed   \n",
+       "3   4  what is the weather today?   it is rainy today   \n",
+       "4   5                  how are u?           i am fine   \n",
        "\n",
-       "                    Timestamp  \n",
-       "0  2023-07-24 08:28:29.461045  \n",
-       "1  2023-07-24 08:29:17.521467  \n",
-       "2  2023-07-24 08:29:58.248998  \n",
-       "3  2023-07-24 19:26:04.713297  \n",
-       "4  2023-07-24 19:26:28.676427  "
+       "                                  Low Ranking Answer  \n",
+       "0                                         I'm a girl  \n",
+       "1   The weather today is sunny with a high of 75 ...  \n",
+       "2                                         I'm a girl  \n",
+       "3   The weather today is sunny with a high of 75 ...  \n",
+       "4                                           I'm fine  "
       ]
      },
-     "execution_count": 6,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "stack_exchange_df = pd.read_csv(\"stack_exchange_10k_dataset.csv\", index_col=0)\n",
-    "stack_exchange_df.head()"
+    "my_data_pd = ranking_database.retrieve_all_question_answers_as_pandas()\n",
+    "my_data_pd.head()"
    ]
   },
   {
@@ -182,15 +339,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "My local database has 118 samples\n"
+      "My local database has 60 samples\n"
      ]
     },
     {
      "data": {
       "text/plain": [
        "Dataset({\n",
-       "    features: ['ID', 'Question', 'Answer'],\n",
-       "    num_rows: 118\n",
+       "    features: ['ID', 'Question', 'Up Ranking Answer', 'Low Ranking Answer'],\n",
+       "    num_rows: 60\n",
        "})"
       ]
      },
@@ -202,8 +359,12 @@
    "source": [
     "from datasets import Dataset\n",
     "\n",
-    "print(\"My ranking database has {} samples\".format(stack_exchange_df.shape[0]))\n",
-    "dataset = Dataset.from_dict(stack_exchange_df)\n",
+    "my_data_pd = my_data_pd[[\"ID\",\n",
+    "                        \"Question\",\n",
+    "                        \"Up Ranking Answer\",\n",
+    "                        \"Low Ranking Answer\"]]\n",
+    "print(\"My local database has {} samples\".format(my_data_pd.shape[0]))\n",
+    "dataset = Dataset.from_dict(my_data_pd)\n",
     "dataset"
    ]
   },
@@ -216,55 +377,51 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Downloading (…)okenizer_config.json: 100%|██████████| 749/749 [00:00<00:00, 5.20MB/s]\n",
-      "Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 14.1MB/s]\n",
-      "Downloading (…)/main/tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 54.1MB/s]\n",
-      "Downloading (…)cial_tokens_map.json: 100%|██████████| 414/414 [00:00<00:00, 3.12MB/s]\n",
-      "/opt/conda/envs/pykoi/lib/python3.10/site-packages/trl/trainer/utils.py:246: UserWarning: The passed formatting_func has more than one argument. Usually that function should have a single argument `example` which corresponds to the dictonnary returned by each element of the dataset. Make sure you know what you are doing.\n",
-      "  warnings.warn(\n"
+      "Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at databricks/dolly-v2-3b and are newly initialized: ['score.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "Parameter 'function'=<bound method RewardFinetuning._preprocess_function of <pykoi.rlhf.rw_finetuning.RewardFinetuning object at 0x7fbb3af82950>> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "My local database has 118 samples\n",
-      "Size of the train set: 106.               Size of the validation set: 12\n"
+      "trainable params: 5,248,000 || all params: 2,651,617,280 || trainable%: 0.19791694825582068\n",
+      "My local database has 60 samples for RW finetuning\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Downloading (…)lve/main/config.json: 100%|██████████| 630/630 [00:00<00:00, 4.69MB/s]\n",
-      "Downloading (…)fetensors.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 73.3MB/s]\n",
-      "Downloading (…)of-00002.safetensors: 100%|██████████| 9.98G/9.98G [00:47<00:00, 210MB/s]\n",
-      "Downloading (…)of-00002.safetensors: 100%|██████████| 3.50G/3.50G [00:26<00:00, 130MB/s]\n",
-      "Downloading shards: 100%|██████████| 2/2 [01:14<00:00, 37.34s/it]\n",
-      "Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.71s/it]\n",
-      "Downloading (…)neration_config.json: 100%|██████████| 167/167 [00:00<00:00, 1.25MB/s]\n",
-      "/opt/conda/envs/pykoi/lib/python3.10/site-packages/peft/utils/other.py:102: FutureWarning: prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead.\n",
-      "  warnings.warn(\n",
-      "Using pad_token, but it is not set yet.\n",
-      "/opt/conda/envs/pykoi/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:159: UserWarning: You didn't pass a `max_seq_length` argument to the SFTTrainer, this will default to 1024\n",
-      "  warnings.warn(\n",
-      "/opt/conda/envs/pykoi/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed `packing=True` to the SFTTrainer, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+      "Map:   0%|          | 0/60 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n",
+      "                                                     "
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size of the train set: 54.                 Size of the validation set: 6\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
       "  warnings.warn(\n",
-      "/opt/conda/envs/pykoi/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2411: UserWarning: `max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.\n",
       "  warnings.warn(\n",
-      "/opt/conda/envs/pykoi/lib/python3.10/site-packages/trl/trainer/utils.py:268: UserWarning: The dataset reached end and the iterator is reset to the start.\n",
-      "  warnings.warn(\"The dataset reached end and the iterator is reset to the start.\")\n",
-      "You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
-      "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n",
-      "/opt/conda/envs/pykoi/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
-      "  warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n"
+      "Could not estimate the number of tokens of the input, floating-point operations will not be computed\n"
      ]
     },
     {
@@ -273,8 +430,8 @@
        "\n",
        "    <div>\n",
        "      \n",
-       "      <progress value='5' max='5' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
-       "      [5/5 00:32, Epoch 0/1]\n",
+       "      <progress value='10' max='10' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [10/10 00:23, Epoch 5/10]\n",
        "    </div>\n",
        "    <table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
@@ -282,9 +439,16 @@
        "      <th>Step</th>\n",
        "      <th>Training Loss</th>\n",
        "      <th>Validation Loss</th>\n",
+       "      <th>Accuracy</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>10</td>\n",
+       "      <td>0.821100</td>\n",
+       "      <td>0.495117</td>\n",
+       "      <td>0.666667</td>\n",
+       "    </tr>\n",
        "  </tbody>\n",
        "</table><p>"
       ],
@@ -294,39 +458,16 @@
      },
      "metadata": {},
      "output_type": "display_data"
-    },
-    {
-     "ename": "",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
-     ]
     }
    ],
    "source": [
     "# run supervised finetuning\n",
-    "config = RLHFConfig(base_model_path=\"elinas/llama-7b-hf-transformers-4.29\", \n",
+    "config = RLHFConfig(base_model_path=\"databricks/dolly-v2-3b\", \n",
     "                          dataset_type=\"local_db\",\n",
-    "                          \n",
     "                          )\n",
-    "rlhf_step1_sft = SupervisedFinetuning(config)\n",
-    "rlhf_step1_sft.train_and_save(\"./models/rlhf_step1_sft\")\n"
+    "rlhf_step2_rft = RewardFinetuning(config)\n",
+    "rlhf_step2_rft.train_and_save(\"./models/rlhf_step2_rw\")\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/example/rlhf/demo_rl.ipynb b/example/rlhf/demo_rl.ipynb
index 7eadde3..5c65b6e 100644
--- a/example/rlhf/demo_rl.ipynb
+++ b/example/rlhf/demo_rl.ipynb
@@ -50,9 +50,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/miniconda3/envs/koi/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
    "source": [
     "from pykoi.rlhf import RLHFConfig\n",
     "\n",
@@ -61,7 +70,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -92,7 +101,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -112,7 +121,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -302,66 +311,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using pad_token, but it is not set yet.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "My local database has 118 samples\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Map:   0%|          | 0/118 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n",
-      "Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.97s/it]\n",
-      "/opt/conda/envs/pykoi/lib/python3.10/site-packages/peft/utils/other.py:102: FutureWarning: prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead.\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "My local database has 118 samples\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Map:   0%|          | 0/118 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n",
-      "                                                   \r"
-     ]
-    },
-    {
-     "ename": "",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel."
-     ]
-    },
-    {
-     "ename": "",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
+    "# TO DO: accelerate working properly in jupyter notebook\n",
     "from accelerate import notebook_launcher\n",
     "\n",
     "config = RLHFConfig(base_model_path=\"elinas/llama-7b-hf-transformers-4.29\", # \"elinas/llama-7b-hf-transformers-4.29\", \n",
@@ -375,13 +329,6 @@
     "rlhf_step3_rl = RL(config)\n",
     "rlhf_step3_rl.train(\"./models/rlhf_step3_rl\", num_processes=1)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/example/rlhf/demo_rl.py b/example/rlhf/demo_rl.py
index 79e8d2f..16e0eaf 100644
--- a/example/rlhf/demo_rl.py
+++ b/example/rlhf/demo_rl.py
@@ -1,4 +1,6 @@
 """
+huggingface-cli login --token $HUGGINGFACE_TOKEN
+
 accelerate config
 
 LOCAL_DIR=/home/ubuntu/pykoi/pykoi # change this to your local path
@@ -12,12 +14,19 @@
 from pykoi.rlhf import RLHFConfig
 from pykoi.rlhf import RLFinetuning
 
+# Log into huggingface with token if it is not done in the command line.
+# https://huggingface.co/docs/huggingface_hub/quick-start#login
+# https://huggingface.co/settings/tokens
+
+# from huggingface_hub import login
+# login(token="")
+
 
 # use huggingface sft and reward model
 config = RLHFConfig(
     base_model_path="models/rlhf_step1_sft",    #"elinas/llama-7b-hf-transformers-4.29", 
-    dataset_type="huggingface", 
-    dataset_name="goldmermaid/stack_exchange_rank_10k_dataset",
+    dataset_type="local_db", 
+    # dataset_name="goldmermaid/stack_exchange_rank_10k_dataset",
     dataset_subset_rl="data",
     reward_model_path="models/rlhf_step2_rw/", #"cambioml/rlhf_reward_model",
     save_freq=1,
diff --git a/example/rlhf/mlflow/demo_reward_mlflow.ipynb b/example/rlhf/mlflow/demo_reward_mlflow.ipynb
new file mode 100644
index 0000000..4a95208
--- /dev/null
+++ b/example/rlhf/mlflow/demo_reward_mlflow.ipynb
@@ -0,0 +1,600 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %reload_ext autoreload\n",
+    "# %autoreload 2\n",
+    "\n",
+    "# import os\n",
+    "# import sys\n",
+    "\n",
+    "# # Add the root folder to the module search path\n",
+    "# # Get the current directory\n",
+    "# current_directory = os.getcwd()\n",
+    "\n",
+    "# # Move two levels up (go to the parent directory of the parent directory)\n",
+    "# two_levels_up_directory = os.path.dirname(os.path.dirname(current_directory))\n",
+    "\n",
+    "# print(two_levels_up_directory)\n",
+    "\n",
+    "# sys.path.append(two_levels_up_directory)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from pykoi.rlhf import RLHFConfig\n",
+    "from pykoi.rlhf import RewardFinetuning\n",
+    "from pykoi.chat import RankingDatabase\n",
+    "import mlflow\n",
+    "import datetime"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Train RLHF using the data from database\n",
+    "\n",
+    "Let's take a look of the QA data and process it for training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>question</th>\n",
+       "      <th>up_ranking_answer</th>\n",
+       "      <th>low_ranking_answer</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>who are u?</td>\n",
+       "      <td>Unemployed</td>\n",
+       "      <td>I'm a girl</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>what is the weather today?</td>\n",
+       "      <td>it is rainy today</td>\n",
+       "      <td>The weather today is sunny with a high of 75 ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>who are u?</td>\n",
+       "      <td>Unemployed</td>\n",
+       "      <td>I'm a girl</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>what is the weather today?</td>\n",
+       "      <td>it is rainy today</td>\n",
+       "      <td>The weather today is sunny with a high of 75 ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>how are u?</td>\n",
+       "      <td>i am fine</td>\n",
+       "      <td>I'm fine</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id                    question   up_ranking_answer  \\\n",
+       "0   1                  who are u?          Unemployed   \n",
+       "1   2  what is the weather today?   it is rainy today   \n",
+       "2   3                  who are u?          Unemployed   \n",
+       "3   4  what is the weather today?   it is rainy today   \n",
+       "4   5                  how are u?           i am fine   \n",
+       "\n",
+       "                                  low_ranking_answer  \n",
+       "0                                         I'm a girl  \n",
+       "1   The weather today is sunny with a high of 75 ...  \n",
+       "2                                         I'm a girl  \n",
+       "3   The weather today is sunny with a high of 75 ...  \n",
+       "4                                           I'm fine  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "input_root = \"input_rw/\"\n",
+    "input_file = \"ranking.csv\"\n",
+    "stack_exchange_df = pd.read_csv(input_root + input_file)\n",
+    "stack_exchange_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "My ranking database has 30 samples\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['id', 'question', 'up_ranking_answer', 'low_ranking_answer'],\n",
+       "    num_rows: 30\n",
+       "})"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datasets import Dataset\n",
+    "\n",
+    "print(\"My ranking database has {} samples\".format(stack_exchange_df.shape[0]))\n",
+    "dataset = Dataset.from_dict(stack_exchange_df)\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "RANKING_CSV_HEADER_ID = 'id'\n",
+    "RANKING_CSV_HEADER_QUESTION = 'question'\n",
+    "RANKING_CSV_HEADER_UP_RANKING_ANSWER = 'up_ranking_answer'\n",
+    "RANKING_CSV_HEADER_LOW_RANKING_ANSWER = 'low_ranking_answer'\n",
+    "RANKING_CSV_HEADER = (\n",
+    "    RANKING_CSV_HEADER_ID,\n",
+    "    RANKING_CSV_HEADER_QUESTION,\n",
+    "    RANKING_CSV_HEADER_UP_RANKING_ANSWER,\n",
+    "    RANKING_CSV_HEADER_LOW_RANKING_ANSWER\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<pykoi.chat.db.ranking_database.RankingDatabase at 0x7f4a491569b0>"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ranking_database = RankingDatabase()\n",
+    "for row in stack_exchange_df.iloc[:].to_dict('records'):\n",
+    "    ranking_id = ranking_database.insert_ranking(question=row[RANKING_CSV_HEADER_QUESTION],\n",
+    "                                       up_ranking_answer=row[RANKING_CSV_HEADER_UP_RANKING_ANSWER],\n",
+    "                                       low_ranking_answer=row[RANKING_CSV_HEADER_LOW_RANKING_ANSWER]\n",
+    "                                       )\n",
+    "ranking_database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ID</th>\n",
+       "      <th>Question</th>\n",
+       "      <th>Up Ranking Answer</th>\n",
+       "      <th>Low Ranking Answer</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>who are u?</td>\n",
+       "      <td>Unemployed</td>\n",
+       "      <td>I'm a girl</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>what is the weather today?</td>\n",
+       "      <td>it is rainy today</td>\n",
+       "      <td>The weather today is sunny with a high of 75 ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>who are u?</td>\n",
+       "      <td>Unemployed</td>\n",
+       "      <td>I'm a girl</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>what is the weather today?</td>\n",
+       "      <td>it is rainy today</td>\n",
+       "      <td>The weather today is sunny with a high of 75 ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>how are u?</td>\n",
+       "      <td>i am fine</td>\n",
+       "      <td>I'm fine</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   ID                    Question   Up Ranking Answer  \\\n",
+       "0   1                  who are u?          Unemployed   \n",
+       "1   2  what is the weather today?   it is rainy today   \n",
+       "2   3                  who are u?          Unemployed   \n",
+       "3   4  what is the weather today?   it is rainy today   \n",
+       "4   5                  how are u?           i am fine   \n",
+       "\n",
+       "                                  Low Ranking Answer  \n",
+       "0                                         I'm a girl  \n",
+       "1   The weather today is sunny with a high of 75 ...  \n",
+       "2                                         I'm a girl  \n",
+       "3   The weather today is sunny with a high of 75 ...  \n",
+       "4                                           I'm fine  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "my_data_pd = ranking_database.retrieve_all_question_answers_as_pandas()\n",
+    "my_data_pd.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "My local database has 30 samples\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['ID', 'Question', 'Up Ranking Answer', 'Low Ranking Answer'],\n",
+       "    num_rows: 30\n",
+       "})"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datasets import Dataset\n",
+    "\n",
+    "my_data_pd = my_data_pd[[\"ID\",\n",
+    "                        \"Question\",\n",
+    "                        \"Up Ranking Answer\",\n",
+    "                        \"Low Ranking Answer\"]]\n",
+    "print(\"My local database has {} samples\".format(my_data_pd.shape[0]))\n",
+    "dataset = Dataset.from_dict(my_data_pd)\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Train with RLHF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/07 20:22:44 INFO mlflow.tracking.fluent: Experiment with name 'rlhf_step2_rw/2023-10-07 20:22:44.807003' does not exist. Creating a new experiment.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<Experiment: artifact_location='file:///home/ubuntu/pykoi/example/rlhf/mlflow/mlruns/556506016264966830', creation_time=1696710164821, experiment_id='556506016264966830', last_update_time=1696710164821, lifecycle_stage='active', name='rlhf_step2_rw/2023-10-07 20:22:44.807003', tags={}>"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Set up mlflow experiment name.\n",
+    "\n",
+    "# mlflow.set_tracking_uri(\"http://x.x.x.x:5000\")\n",
+    "experiment = \"rlhf_step2_rw\"\n",
+    "current_time = str(datetime.datetime.now())\n",
+    "mlflow_experiment_name = '/'.join([experiment, current_time])\n",
+    "\n",
+    "try:\n",
+    "    mlflow.end_run()\n",
+    "except:\n",
+    "    print(\"No mlflow run in progress\")\n",
+    "\n",
+    "mlflow.set_experiment(mlflow_experiment_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set pykoi parameters.\n",
+    "reward_model_path = \"databricks/dolly-v2-3b\"\n",
+    "# reward_model_path = \"elinas/llama-7b-hf-transformers-4.29\"\n",
+    "dataset_type = \"local_db\"\n",
+    "trained_model_path = \"./models/rlhf_step2_rw\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'./models/rlhf_step2_rw'"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Manually log pykoi parameters into mlflow. Torch level parameters are automatically logged.\n",
+    "mlflow.log_param(\"pykoi_reward_model_path\", reward_model_path)\n",
+    "mlflow.log_param(\"pykoi_dataset_type\", dataset_type)\n",
+    "mlflow.log_param(\"pykoi_trained_model_path\", trained_model_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at databricks/dolly-v2-3b and are newly initialized: ['score.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "Parameter 'function'=<bound method RewardFinetuning._preprocess_function of <pykoi.rlhf.rw_finetuning.RewardFinetuning object at 0x7f4a48ab5870>> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "trainable params: 5,248,000 || all params: 2,651,617,280 || trainable%: 0.19791694825582068\n",
+      "My local database has 30 samples for RW finetuning\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Map:   0%|          | 0/30 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n",
+      "                                                     "
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size of the train set: 27.                 Size of the validation set: 3\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2411: UserWarning: `max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.\n",
+      "  warnings.warn(\n",
+      "Could not estimate the number of tokens of the input, floating-point operations will not be computed\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='10' max='10' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [10/10 00:22, Epoch 10/10]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "      <th>Accuracy</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>10</td>\n",
+       "      <td>0.806100</td>\n",
+       "      <td>1.217773</td>\n",
+       "      <td>0.333333</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Run supervised finetuning\n",
+    "# Training metrics are automatically logged into mlflow.\n",
+    "config = RLHFConfig(reward_model_path=reward_model_path, \n",
+    "                          dataset_type=dataset_type,\n",
+    "                          )\n",
+    "rlhf_step2_rft = RewardFinetuning(config)\n",
+    "rlhf_step2_rft.train_and_save(trained_model_path)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the trained reward model and input into mlflow artifacts.\n",
+    "mlflow.log_artifacts(trained_model_path)\n",
+    "mlflow.log_artifacts(input_root)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the terminal, run\n",
+    "```\n",
+    "mlflow ui\n",
+    "```\n",
+    "and go to http://127.0.0.1:5000 in the browser to view the experiment in the UI."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pykoi",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/example/rlhf/mlflow/demo_rl_mlflow.py b/example/rlhf/mlflow/demo_rl_mlflow.py
new file mode 100644
index 0000000..d5d3ff9
--- /dev/null
+++ b/example/rlhf/mlflow/demo_rl_mlflow.py
@@ -0,0 +1,75 @@
+"""
+huggingface-cli login --token $HUGGINGFACE_TOKEN
+
+accelerate config
+
+LOCAL_DIR=/home/ubuntu/pykoi # change this to your local path
+
+export PYTHONPATH=$PYTHONPATH:${LOCAL_DIR}
+
+accelerate launch --num_machines 1  --num_processes 1 --mixed_precision fp16 ${LOCAL_DIR}/example/rlhf/mlflow/demo_rl_mlflow.py
+"""
+# accelerate launch --num_machines 1  --num_processes 1 --mixed_precision fp16 example/rlhf/mlflow/demo_rl_mlflow.py
+
+from pykoi.rlhf import RLHFConfig
+from pykoi.rlhf import RLFinetuning
+import mlflow
+import datetime
+
+# Log into huggingface with token if it is not done in the command line.
+# https://huggingface.co/docs/huggingface_hub/quick-start#login
+# https://huggingface.co/settings/tokens
+
+# from huggingface_hub import login
+# login(token="")
+
+# Set up mlflow experiment name.
+mlflow.set_tracking_uri("example/rlhf/mlflow/mlruns")
+experiment = "rlhf_step3_rl"
+current_time = str(datetime.datetime.now())
+mlflow_experiment_name = '/'.join([experiment, current_time])
+mlflow.set_experiment(mlflow_experiment_name)
+
+# Set pykoi parameters.
+base_model_path = "example/rlhf/mlflow/models/rlhf_step1_sft"
+dataset_type = "local_db"
+reward_model_path = "example/rlhf/mlflow/models/rlhf_step2_rw"
+dataset_subset_rl = "data"
+save_freq = 1
+ppo_batch_size = 32
+ppo_epochs = 4
+total_epochs = 5
+output_dir = "example/rlhf/mlflow/models/rlhf_step3_rl"
+
+# Manually log pykoi parameters into mlflow. Torch level parameters are automatically logged.
+mlflow.log_param("pykoi_base_model_path", base_model_path)
+mlflow.log_param("pykoi_dataset_type", dataset_type)
+mlflow.log_param("pykoi_reward_model_path", reward_model_path)
+mlflow.log_param("pykoi_dataset_subset_rl", dataset_subset_rl)
+mlflow.log_param("pykoi_save_freq", save_freq)
+mlflow.log_param("pykoi_ppo_batch_size", ppo_batch_size)
+mlflow.log_param("pykoi_ppo_epochs", ppo_epochs)
+mlflow.log_param("pykoi_total_epochs", total_epochs)
+mlflow.log_param("pykoi_output_dir", output_dir)
+
+# Use huggingface sft and reward model
+# Training metrics are automatically logged into mlflow.
+config = RLHFConfig(
+    base_model_path=base_model_path,    #"elinas/llama-7b-hf-transformers-4.29", 
+    dataset_type=dataset_type,
+    # dataset_type="huggingface", 
+    # dataset_name="goldmermaid/stack_exchange_rank_10k_dataset",
+    dataset_subset_rl=dataset_subset_rl,
+    reward_model_path=reward_model_path, #"cambioml/rlhf_reward_model",
+    save_freq=save_freq,
+    ppo_batch_size=ppo_batch_size,
+    ppo_epochs=ppo_epochs,
+    total_epochs=total_epochs,
+    output_dir=output_dir,
+)
+
+rlhf_step3_rl = RLFinetuning(config)
+rlhf_step3_rl.train_and_save(output_dir)
+
+# Save the trained model into mlflow artifacts.
+mlflow.log_artifacts(output_dir)
\ No newline at end of file
diff --git a/example/rlhf/mlflow/demo_rw_finetuning_mlflow.py b/example/rlhf/mlflow/demo_rw_finetuning_mlflow.py
new file mode 100644
index 0000000..3ffcc69
--- /dev/null
+++ b/example/rlhf/mlflow/demo_rw_finetuning_mlflow.py
@@ -0,0 +1,53 @@
+"""Demo for the reward fine tuning.
+
+python -m example.rlhf.demo_rw_finetuning_mlfow
+"""
+
+from pykoi.rlhf import RLHFConfig
+from pykoi.rlhf import RewardFinetuning
+from pykoi.chat import RankingDatabase
+import mlflow
+import datetime
+
+from pykoi.chat.db.constants import (
+    RANKING_CSV_HEADER_ID,
+    RANKING_CSV_HEADER_QUESTION,
+    RANKING_CSV_HEADER_UP_RANKING_ANSWER,
+    RANKING_CSV_HEADER_LOW_RANKING_ANSWER)
+
+# get data from local database
+ranking_database = RankingDatabase()
+my_data_pd = ranking_database.retrieve_all_question_answers_as_pandas()
+my_data_pd = my_data_pd[[
+    RANKING_CSV_HEADER_ID,
+    RANKING_CSV_HEADER_QUESTION,
+    RANKING_CSV_HEADER_UP_RANKING_ANSWER,
+    RANKING_CSV_HEADER_LOW_RANKING_ANSWER]]
+
+# analyze the data
+print(my_data_pd)
+print("My local database has {} samples in total".format(my_data_pd.shape[0]))
+
+# Set up mlflow experiment name.
+# mlflow.set_tracking_uri("http://x.x.x.x:5000")
+experiment = "rlhf_step2_rw"
+current_time = str(datetime.datetime.now())
+mlflow_experiment_name = '/'.join([experiment, current_time])
+mlflow.set_experiment(mlflow_experiment_name)
+
+# Set pykoi parameters.
+reward_model_path = "databricks/dolly-v2-3b"
+trained_model_path = "./models/rlhf_step2_rw"
+
+# Manually log pykoi parameters into mlflow. Torch level parameters are automatically logged.
+mlflow.log_param("pykoi_reward_model_path", reward_model_path)
+mlflow.log_param("pykoi_trained_model_path", trained_model_path)
+
+# run reward model finetuning
+# config = RLHFConfig(dataset_type="local_db")
+config = RLHFConfig(reward_model_path=reward_model_path)
+rlhf_step2_rft = RewardFinetuning(config)
+rlhf_step2_rft.train_and_save(trained_model_path)
+
+# Save the trained reward model into mlflow artifacts.
+mlflow.log_artifacts(trained_model_path)
diff --git a/example/rlhf/mlflow/demo_supervised_finetuning_mlflow.ipynb b/example/rlhf/mlflow/demo_supervised_finetuning_mlflow.ipynb
new file mode 100644
index 0000000..3313894
--- /dev/null
+++ b/example/rlhf/mlflow/demo_supervised_finetuning_mlflow.ipynb
@@ -0,0 +1,882 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %reload_ext autoreload\n",
+    "# %autoreload 2\n",
+    "\n",
+    "# import os\n",
+    "# import sys\n",
+    "\n",
+    "# # Add the root folder to the module search path\n",
+    "# # Get the current directory\n",
+    "# current_directory = os.getcwd()\n",
+    "\n",
+    "# # Move two levels up (go to the parent directory of the parent directory)\n",
+    "# two_levels_up_directory = os.path.dirname(os.path.dirname(current_directory))\n",
+    "\n",
+    "# print(two_levels_up_directory)\n",
+    "\n",
+    "# sys.path.append(two_levels_up_directory)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pykoi.chat import QuestionAnswerDatabase\n",
+    "from pykoi.rlhf import RLHFConfig\n",
+    "from pykoi.rlhf import SupervisedFinetuning\n",
+    "import mlflow\n",
+    "import datetime"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define my DB"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qa_database = QuestionAnswerDatabase()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Insert my data to the DB"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Question</th>\n",
+       "      <th>Answer</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>What is InstructGPT?</td>\n",
+       "      <td>InstructGPT is a language model developed by O...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Why does InstructGPT work?</td>\n",
+       "      <td>InstructGPT works due to a two-step training p...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>What are some commonly used evaluation metrics...</td>\n",
+       "      <td>One main evaluation metric for InstructGPT is ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>How is InstructGPT used?</td>\n",
+       "      <td>InstructGPT can be used in any application tha...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>What are some common applications of InstructGPT?</td>\n",
+       "      <td>Common applications of InstructGPT can be in e...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>How does InstructGPT handle ambiguous prompts?</td>\n",
+       "      <td>For ambiguous prompts, InstructGPT aims to ask...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Can InstructGPT generate incorrect or nonsensi...</td>\n",
+       "      <td>Yes, InstructGPT can sometimes produce plausib...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>How does InstructGPT manage harmful and biased...</td>\n",
+       "      <td>InstructGPT has a moderation system in place t...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>What is the role of human evaluators in the tr...</td>\n",
+       "      <td>Human evaluators play a crucial role in the tr...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>What are the limitations of InstructGPT?</td>\n",
+       "      <td>There are several limitations to InstructGPT. ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>What is reinforcement learning with human feed...</td>\n",
+       "      <td>Reinforcement learning with human feedback is ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>How is human feedback integrated into the rein...</td>\n",
+       "      <td>Human feedback can be integrated into reinforc...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>What are the advantages of reinforcement learn...</td>\n",
+       "      <td>Reinforcement learning with human feedback can...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>What are some challenges of reinforcement lear...</td>\n",
+       "      <td>One of the main challenges is the potential fo...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Can you give an example of an application of r...</td>\n",
+       "      <td>One potential application of reinforcement lea...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>What is 'step 1 supervised finetuning' in rein...</td>\n",
+       "      <td>'Step 1 supervised finetuning' in reinforcemen...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>Why does step 1 supervised finetuning work in ...</td>\n",
+       "      <td>Step 1 supervised finetuning works in reinforc...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>What are some commonly used evaluation metrics...</td>\n",
+       "      <td>Common evaluation metrics in supervised finetu...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>How is step 1 supervised finetuning used in re...</td>\n",
+       "      <td>In step 1 supervised finetuning, a model is fi...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>What are some common applications of step 1 su...</td>\n",
+       "      <td>Step 1 supervised finetuning can be used in a ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>What is the role of human demonstrations in st...</td>\n",
+       "      <td>Human demonstrations play a crucial role in st...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>How is the training data collected for step 1 ...</td>\n",
+       "      <td>The training data for step 1 supervised finetu...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>What are the limitations of step 1 supervised ...</td>\n",
+       "      <td>While supervised finetuning can provide a stro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>How does step 1 supervised finetuning fit into...</td>\n",
+       "      <td>Step 1 supervised finetuning is typically the ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>What are the prerequisites for using step 1 su...</td>\n",
+       "      <td>To use step 1 supervised finetuning in reinfor...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>What is 'reward modeling' in reinforcement lea...</td>\n",
+       "      <td>Reward modeling is the second step in reinforc...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>Why does reward modeling work in reinforcement...</td>\n",
+       "      <td>Reward modeling works because it bridges the g...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>What are some commonly used evaluation metrics...</td>\n",
+       "      <td>In reward modeling, one common evaluation metr...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>How is reward modeling used in reinforcement l...</td>\n",
+       "      <td>In reward modeling, a model is trained on a da...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>What are some common applications of reward mo...</td>\n",
+       "      <td>Reward modeling can be applied in any scenario...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>30</th>\n",
+       "      <td>What is the role of human feedback in reward m...</td>\n",
+       "      <td>Human feedback plays a critical role in reward...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>31</th>\n",
+       "      <td>How is the training data collected for reward ...</td>\n",
+       "      <td>The training data for reward modeling is typic...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>32</th>\n",
+       "      <td>What are the limitations of reward modeling?</td>\n",
+       "      <td>One limitation of reward modeling is that it r...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33</th>\n",
+       "      <td>How does reward modeling fit into the broader ...</td>\n",
+       "      <td>Reward modeling is typically the second step i...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <td>What are the prerequisites for using reward mo...</td>\n",
+       "      <td>To use reward modeling in reinforcement learni...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35</th>\n",
+       "      <td>What is training a reinforcement learning (RL)...</td>\n",
+       "      <td>Training an RL model in this context refers to...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>36</th>\n",
+       "      <td>Why does training an RL model work in reinforc...</td>\n",
+       "      <td>Training an RL model works because it allows t...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>37</th>\n",
+       "      <td>What are some commonly used evaluation metrics...</td>\n",
+       "      <td>The most common evaluation metric in training ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38</th>\n",
+       "      <td>How is training an RL model used in reinforcem...</td>\n",
+       "      <td>In training an RL model, the model interacts w...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39</th>\n",
+       "      <td>What are some common applications of training ...</td>\n",
+       "      <td>Training an RL model can be used in a wide ran...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>40</th>\n",
+       "      <td>How can loss explosion be avoided during the t...</td>\n",
+       "      <td>Loss explosion can be avoided through a variet...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>41</th>\n",
+       "      <td>What are common failures in training an RL model?</td>\n",
+       "      <td>Common failures in training an RL model includ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>42</th>\n",
+       "      <td>What are the roles of exploration and exploita...</td>\n",
+       "      <td>Exploration and exploitation play key roles in...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43</th>\n",
+       "      <td>How does training an RL model fit into the bro...</td>\n",
+       "      <td>Training an RL model is typically the third st...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>44</th>\n",
+       "      <td>What are the prerequisites for training an RL ...</td>\n",
+       "      <td>To train an RL model in reinforcement learning...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                             Question  \\\n",
+       "0                                What is InstructGPT?   \n",
+       "1                          Why does InstructGPT work?   \n",
+       "2   What are some commonly used evaluation metrics...   \n",
+       "3                            How is InstructGPT used?   \n",
+       "4   What are some common applications of InstructGPT?   \n",
+       "5      How does InstructGPT handle ambiguous prompts?   \n",
+       "6   Can InstructGPT generate incorrect or nonsensi...   \n",
+       "7   How does InstructGPT manage harmful and biased...   \n",
+       "8   What is the role of human evaluators in the tr...   \n",
+       "9            What are the limitations of InstructGPT?   \n",
+       "10  What is reinforcement learning with human feed...   \n",
+       "11  How is human feedback integrated into the rein...   \n",
+       "12  What are the advantages of reinforcement learn...   \n",
+       "13  What are some challenges of reinforcement lear...   \n",
+       "14  Can you give an example of an application of r...   \n",
+       "15  What is 'step 1 supervised finetuning' in rein...   \n",
+       "16  Why does step 1 supervised finetuning work in ...   \n",
+       "17  What are some commonly used evaluation metrics...   \n",
+       "18  How is step 1 supervised finetuning used in re...   \n",
+       "19  What are some common applications of step 1 su...   \n",
+       "20  What is the role of human demonstrations in st...   \n",
+       "21  How is the training data collected for step 1 ...   \n",
+       "22  What are the limitations of step 1 supervised ...   \n",
+       "23  How does step 1 supervised finetuning fit into...   \n",
+       "24  What are the prerequisites for using step 1 su...   \n",
+       "25  What is 'reward modeling' in reinforcement lea...   \n",
+       "26  Why does reward modeling work in reinforcement...   \n",
+       "27  What are some commonly used evaluation metrics...   \n",
+       "28  How is reward modeling used in reinforcement l...   \n",
+       "29  What are some common applications of reward mo...   \n",
+       "30  What is the role of human feedback in reward m...   \n",
+       "31  How is the training data collected for reward ...   \n",
+       "32       What are the limitations of reward modeling?   \n",
+       "33  How does reward modeling fit into the broader ...   \n",
+       "34  What are the prerequisites for using reward mo...   \n",
+       "35  What is training a reinforcement learning (RL)...   \n",
+       "36  Why does training an RL model work in reinforc...   \n",
+       "37  What are some commonly used evaluation metrics...   \n",
+       "38  How is training an RL model used in reinforcem...   \n",
+       "39  What are some common applications of training ...   \n",
+       "40  How can loss explosion be avoided during the t...   \n",
+       "41  What are common failures in training an RL model?   \n",
+       "42  What are the roles of exploration and exploita...   \n",
+       "43  How does training an RL model fit into the bro...   \n",
+       "44  What are the prerequisites for training an RL ...   \n",
+       "\n",
+       "                                               Answer  \n",
+       "0   InstructGPT is a language model developed by O...  \n",
+       "1   InstructGPT works due to a two-step training p...  \n",
+       "2   One main evaluation metric for InstructGPT is ...  \n",
+       "3   InstructGPT can be used in any application tha...  \n",
+       "4   Common applications of InstructGPT can be in e...  \n",
+       "5   For ambiguous prompts, InstructGPT aims to ask...  \n",
+       "6   Yes, InstructGPT can sometimes produce plausib...  \n",
+       "7   InstructGPT has a moderation system in place t...  \n",
+       "8   Human evaluators play a crucial role in the tr...  \n",
+       "9   There are several limitations to InstructGPT. ...  \n",
+       "10  Reinforcement learning with human feedback is ...  \n",
+       "11  Human feedback can be integrated into reinforc...  \n",
+       "12  Reinforcement learning with human feedback can...  \n",
+       "13  One of the main challenges is the potential fo...  \n",
+       "14  One potential application of reinforcement lea...  \n",
+       "15  'Step 1 supervised finetuning' in reinforcemen...  \n",
+       "16  Step 1 supervised finetuning works in reinforc...  \n",
+       "17  Common evaluation metrics in supervised finetu...  \n",
+       "18  In step 1 supervised finetuning, a model is fi...  \n",
+       "19  Step 1 supervised finetuning can be used in a ...  \n",
+       "20  Human demonstrations play a crucial role in st...  \n",
+       "21  The training data for step 1 supervised finetu...  \n",
+       "22  While supervised finetuning can provide a stro...  \n",
+       "23  Step 1 supervised finetuning is typically the ...  \n",
+       "24  To use step 1 supervised finetuning in reinfor...  \n",
+       "25  Reward modeling is the second step in reinforc...  \n",
+       "26  Reward modeling works because it bridges the g...  \n",
+       "27  In reward modeling, one common evaluation metr...  \n",
+       "28  In reward modeling, a model is trained on a da...  \n",
+       "29  Reward modeling can be applied in any scenario...  \n",
+       "30  Human feedback plays a critical role in reward...  \n",
+       "31  The training data for reward modeling is typic...  \n",
+       "32  One limitation of reward modeling is that it r...  \n",
+       "33  Reward modeling is typically the second step i...  \n",
+       "34  To use reward modeling in reinforcement learni...  \n",
+       "35  Training an RL model in this context refers to...  \n",
+       "36  Training an RL model works because it allows t...  \n",
+       "37  The most common evaluation metric in training ...  \n",
+       "38  In training an RL model, the model interacts w...  \n",
+       "39  Training an RL model can be used in a wide ran...  \n",
+       "40  Loss explosion can be avoided through a variet...  \n",
+       "41  Common failures in training an RL model includ...  \n",
+       "42  Exploration and exploitation play key roles in...  \n",
+       "43  Training an RL model is typically the third st...  \n",
+       "44  To train an RL model in reinforcement learning...  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "input_root = \"input_sft/\"\n",
+    "input_file = \"rlhf_qa_dataset.json\"\n",
+    "my_stackoverflow_dataset = pd.read_json(input_root + input_file, orient=\"records\")\n",
+    "my_stackoverflow_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "QA_CSV_HEADER_ID = 'ID'\n",
+    "QA_CSV_HEADER_QUESTION = 'Question'\n",
+    "QA_CSV_HEADER_ANSWER = 'Answer'\n",
+    "QA_CSV_HEADER_VOTE_STATUS = 'Vote Status'\n",
+    "QA_CSV_HEADER_TIMESTAMPS = 'Timestamp'\n",
+    "QA_CSV_HEADER = (\n",
+    "    QA_CSV_HEADER_ID,\n",
+    "    QA_CSV_HEADER_QUESTION,\n",
+    "    QA_CSV_HEADER_ANSWER,\n",
+    "    QA_CSV_HEADER_VOTE_STATUS,\n",
+    "    QA_CSV_HEADER_TIMESTAMPS\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for row in my_stackoverflow_dataset.iloc[3:100].to_dict('records'):\n",
+    "    qa_id = qa_database.insert_question_answer(question=row[QA_CSV_HEADER_QUESTION],\n",
+    "                                       answer=row[QA_CSV_HEADER_ANSWER])\n",
+    "    qa_database.update_vote_status(id=qa_id, vote_status=\"up\") #row[QA_CSV_HEADER_VOTE_STATUS])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<pykoi.chat.db.qa_database.QuestionAnswerDatabase at 0x7f0f203ce7a0>"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "qa_database"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Train RLHF using the data from database\n",
+    "\n",
+    "Let's take a look of the QA data and process it for training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ID</th>\n",
+       "      <th>Question</th>\n",
+       "      <th>Answer</th>\n",
+       "      <th>Vote Status</th>\n",
+       "      <th>Timestamp</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>How is InstructGPT used?</td>\n",
+       "      <td>InstructGPT can be used in any application tha...</td>\n",
+       "      <td>up</td>\n",
+       "      <td>2023-10-07 20:10:04.423359</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>What are some common applications of InstructGPT?</td>\n",
+       "      <td>Common applications of InstructGPT can be in e...</td>\n",
+       "      <td>up</td>\n",
+       "      <td>2023-10-07 20:10:04.432908</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>How does InstructGPT handle ambiguous prompts?</td>\n",
+       "      <td>For ambiguous prompts, InstructGPT aims to ask...</td>\n",
+       "      <td>up</td>\n",
+       "      <td>2023-10-07 20:10:04.439999</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>Can InstructGPT generate incorrect or nonsensi...</td>\n",
+       "      <td>Yes, InstructGPT can sometimes produce plausib...</td>\n",
+       "      <td>up</td>\n",
+       "      <td>2023-10-07 20:10:04.446747</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>How does InstructGPT manage harmful and biased...</td>\n",
+       "      <td>InstructGPT has a moderation system in place t...</td>\n",
+       "      <td>up</td>\n",
+       "      <td>2023-10-07 20:10:04.453973</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   ID                                           Question  \\\n",
+       "0   1                           How is InstructGPT used?   \n",
+       "1   2  What are some common applications of InstructGPT?   \n",
+       "2   3     How does InstructGPT handle ambiguous prompts?   \n",
+       "3   4  Can InstructGPT generate incorrect or nonsensi...   \n",
+       "4   5  How does InstructGPT manage harmful and biased...   \n",
+       "\n",
+       "                                              Answer Vote Status  \\\n",
+       "0  InstructGPT can be used in any application tha...          up   \n",
+       "1  Common applications of InstructGPT can be in e...          up   \n",
+       "2  For ambiguous prompts, InstructGPT aims to ask...          up   \n",
+       "3  Yes, InstructGPT can sometimes produce plausib...          up   \n",
+       "4  InstructGPT has a moderation system in place t...          up   \n",
+       "\n",
+       "                    Timestamp  \n",
+       "0  2023-10-07 20:10:04.423359  \n",
+       "1  2023-10-07 20:10:04.432908  \n",
+       "2  2023-10-07 20:10:04.439999  \n",
+       "3  2023-10-07 20:10:04.446747  \n",
+       "4  2023-10-07 20:10:04.453973  "
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "my_data_pd = qa_database.retrieve_all_question_answers_as_pandas()\n",
+    "my_data_pd.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(42, 5)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "my_data_pd = my_data_pd[my_data_pd[QA_CSV_HEADER_VOTE_STATUS]==\"up\"]\n",
+    "my_data_pd.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "My local database has 42 samples\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['ID', 'Question', 'Answer'],\n",
+       "    num_rows: 42\n",
+       "})"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datasets import Dataset\n",
+    "\n",
+    "my_data_pd = my_data_pd[[QA_CSV_HEADER_ID,\n",
+    "                        QA_CSV_HEADER_QUESTION,\n",
+    "                        QA_CSV_HEADER_ANSWER]]\n",
+    "print(\"My local database has {} samples\".format(my_data_pd.shape[0]))\n",
+    "dataset = Dataset.from_dict(my_data_pd)\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Train with RLHF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023/10/07 20:10:04 INFO mlflow.tracking.fluent: Experiment with name 'rlhf_step1_sft/2023-10-07 20:10:04.803036' does not exist. Creating a new experiment.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<Experiment: artifact_location='file:///home/ubuntu/pykoi/example/rlhf/mlflow/mlruns/960207718741993136', creation_time=1696709404823, experiment_id='960207718741993136', last_update_time=1696709404823, lifecycle_stage='active', name='rlhf_step1_sft/2023-10-07 20:10:04.803036', tags={}>"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Set up mlflow experiment name.\n",
+    "\n",
+    "# mlflow.set_tracking_uri(\"http://x.x.x.x:5000\")\n",
+    "experiment = \"rlhf_step1_sft\"\n",
+    "current_time = str(datetime.datetime.now())\n",
+    "mlflow_experiment_name = '/'.join([experiment, current_time])\n",
+    "\n",
+    "try:\n",
+    "    mlflow.end_run()\n",
+    "except:\n",
+    "    print(\"No mlflow run in progress\")\n",
+    "\n",
+    "mlflow.set_experiment(mlflow_experiment_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set pykoi parameters.\n",
+    "base_model_path = \"elinas/llama-7b-hf-transformers-4.29\"\n",
+    "dataset_type = \"local_db\"\n",
+    "log_freq = 1\n",
+    "max_steps = 5\n",
+    "peft_model_path = \"./models/rlhf_step1_sft\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'./models/rlhf_step1_sft'"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Manually log pykoi parameters into mlflow. Torch level parameters are automatically logged.\n",
+    "mlflow.log_param(\"pykoi_base_model_path\", base_model_path)\n",
+    "mlflow.log_param(\"pykoi_dataset_type\", dataset_type)\n",
+    "mlflow.log_param(\"pykoi_log_freq\", log_freq)\n",
+    "mlflow.log_param(\"pykoi_max_steps\", max_steps)\n",
+    "mlflow.log_param(\"pykoi_peft_model_path\", peft_model_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "My local database has 42 up vote samples for SFT\n",
+      "Size of the train set: 37.               Size of the validation set: 5\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/trl/trainer/utils.py:246: UserWarning: The passed formatting_func has more than one argument. Usually that function should have a single argument `example` which corresponds to the dictonnary returned by each element of the dataset. Make sure you know what you are doing.\n",
+      "  warnings.warn(\n",
+      "Loading checkpoint shards: 100%|██████████| 2/2 [01:42<00:00, 51.39s/it]\n",
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/peft/utils/other.py:122: FutureWarning: prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead.\n",
+      "  warnings.warn(\n",
+      "Using pad_token, but it is not set yet.\n",
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:159: UserWarning: You didn't pass a `max_seq_length` argument to the SFTTrainer, this will default to 1024\n",
+      "  warnings.warn(\n",
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed `packing=True` to the SFTTrainer, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached.\n",
+      "  warnings.warn(\n",
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/trl/trainer/utils.py:268: UserWarning: The dataset reached end and the iterator is reset to the start.\n",
+      "  warnings.warn(\"The dataset reached end and the iterator is reset to the start.\")\n",
+      "You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
+      "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n",
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
+      "  warnings.warn(\n",
+      "/home/ubuntu/miniconda3/envs/pykoi/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
+      "  warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='5' max='5' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [5/5 01:46, Epoch 4/5]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Run supervised finetuning\n",
+    "# Training metrics are automatically logged into mlflow.\n",
+    "config = RLHFConfig(\n",
+    "    base_model_path=base_model_path, \n",
+    "    dataset_type=dataset_type,\n",
+    "    max_steps=max_steps,\n",
+    "    log_freq=log_freq\n",
+    "    )\n",
+    "rlhf_step1_sft = SupervisedFinetuning(config)\n",
+    "rlhf_step1_sft.train_and_save(peft_model_path)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the trained peft model and input into mlflow artifacts.\n",
+    "mlflow.log_artifacts(peft_model_path)\n",
+    "mlflow.log_artifacts(input_root)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the terminal, run\n",
+    "```\n",
+    "mlflow ui\n",
+    "```\n",
+    "and go to http://127.0.0.1:5000 in the browser to view the experiment in the UI."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pykoi",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/example/rlhf/mlflow/input_rw/ranking.csv b/example/rlhf/mlflow/input_rw/ranking.csv
new file mode 100644
index 0000000..dea9695
--- /dev/null
+++ b/example/rlhf/mlflow/input_rw/ranking.csv
@@ -0,0 +1,31 @@
+id,question,up_ranking_answer,low_ranking_answer
+1,who are u?, Unemployed, I'm a girl
+2,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+3,who are u?, Unemployed, I'm a girl
+4,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+5,how are u?, i am fine, I'm fine
+6,what is the roda?, The roda is a circle dance performed by a group of people who are called the roda., the roda is the wheel of the law that the buddha turned when he was teaching the dharma
+7,what is the roda?, the roda is the wheel of the law that the buddha turned when he was teaching the dharma, The roda is a circle dance performed by a group of people who are called the roda.
+8,what is the roda?, The roda is a circle dance performed by a group of people who are called the roda., the roda is the wheel of the law that the buddha turned when he was teaching the dharma
+9,what is the roda?, the roda is the wheel of the law that the buddha turned when he was teaching the dharma, The roda is a circle dance performed by a group of people who are called the roda.
+10,what is the roda?, The roda is a circle dance performed by a group of people who are called the roda., the roda is the wheel of the law that the buddha turned when he was teaching the dharma
+11,what is the roda?, the roda is the wheel of the law that the buddha turned when he was teaching the dharma, The roda is a circle dance performed by a group of people who are called the roda.
+12,what is the weather today?, The weather today is sunny with a high of 75 degrees., it is rainy today
+13,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+14,who are u?, I'm a girl, Unemployed
+15,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+16,what is the weather today?, The weather today is sunny with a high of 75 degrees., it is rainy today
+17,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+18,how are u?, i am fine, I'm fine
+19,how are u?, I'm fine, i am fine
+20,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+21,who is Jojo?, Jojo is a guy who has a blog on Tumblr and a YouTube channel where he vlogs about his life. He's a pretty big deal on the internet., Jojo is the narrator of the story.
+22,who is Jojo?, Jojo is a guy who has a blog on Tumblr and a YouTube channel where he vlogs about his life. He's a pretty big deal on the internet., Jojo is the narrator of the story.
+23,who is Jojo?, Jojo is a guy who has a blog on Tumblr and a YouTube channel where he vlogs about his life. He's a pretty big deal on the internet., Jojo is the narrator of the story.
+24,who is Jojo?, Jojo is the protagonist of Jojo's Bizarre Adventure., Jojo is a fictional character in the manga series JoJo's Bizarre Adventure.
+25,who is Jojo?, Jojo is the protagonist of Jojo's Bizarre Adventure., Jojo is a fictional character in the manga series JoJo's Bizarre Adventure.
+26,who is Jojo?, Jojo is the protagonist of Jojo's Bizarre Adventure., Jojo is a fictional character in the manga series JoJo's Bizarre Adventure.
+27,who is Jojo?, Jojo is the protagonist of Jojo's Bizarre Adventure., Jojo is a fictional character in the manga series JoJo's Bizarre Adventure.
+28,who is Jojo?, Jojo is the protagonist of Jojo's Bizarre Adventure., Jojo is a fictional character in the manga series JoJo's Bizarre Adventure.
+29,who is Kimi?, Kimi is the girl that is in the video and is a sex slave to the Rothschilds., Kimi is a very talented driver but he's not the fastest driver out there. He's not the best driver out there. He's not the driver who is always fastest in qualifying. He's not the driver who is always fastest in the race. He's not the driver who is always the best in the wet. He's not the driver who is always the best in the dry. He's not the driver who is always the best overtaker. He's not the driver who is always
+30,who is Kimi?, Kimi is a very talented driver but he's not the fastest driver out there. He's not the best driver out there. He's not the driver who is always fastest in qualifying. He's not the driver who is always fastest in the race. He's not the driver who is always the best in the wet. He's not the driver who is always the best in the dry. He's not the driver who is always the best overtaker. He's not the driver who is always, Kimi is the girl that is in the video and is a sex slave to the Rothschilds.
diff --git a/example/rlhf/mlflow/input_sft/rlhf_qa_dataset.json b/example/rlhf/mlflow/input_sft/rlhf_qa_dataset.json
new file mode 100644
index 0000000..945a9e9
--- /dev/null
+++ b/example/rlhf/mlflow/input_sft/rlhf_qa_dataset.json
@@ -0,0 +1,182 @@
+[
+  {
+    "Question": "What is InstructGPT?",
+    "Answer": "InstructGPT is a language model developed by OpenAI that is trained to follow instructions in a text prompt given by a human. It is trained using reinforcement learning from human feedback, with a goal to provide more useful and contextually relevant responses based on the instructions it's given."
+  },
+  {
+    "Question": "Why does InstructGPT work?",
+    "Answer": "InstructGPT works due to a two-step training process: pretraining on a large corpus of internet text, followed by fine-tuning with reinforcement learning from human feedback (RLHF). Pretraining helps the model learn grammar, facts about the world, and some level of reasoning. The second step, RLHF, fine-tunes the model using comparison data: multiple model responses ranked by quality by human evaluators. This two-step process allows the model to generalize from the feedback to unseen prompts and respond effectively."
+  },
+  {
+    "Question": "What are some commonly used evaluation metrics in InstructGPT?",
+    "Answer": "One main evaluation metric for InstructGPT is the quality of the responses as rated by human evaluators. They compare different completions to the same prompt and rate them, which provides a comparison data that is used to fine-tune the model. The model's ability to follow instructions, accuracy of information provided, relevance of the response, and comprehension of the prompt are other crucial metrics."
+  },
+  {
+    "Question": "How is InstructGPT used?",
+    "Answer": "InstructGPT can be used in any application that involves text-based instructions. Users provide a text prompt with an instruction and InstructGPT generates a response following that instruction. Potential use cases range from drafting emails or other pieces of writing, to generating Python code, explaining complex concepts in simple language, and more."
+  },
+  {
+    "Question": "What are some common applications of InstructGPT?",
+    "Answer": "Common applications of InstructGPT can be in educational software (explaining complex concepts, helping with homework), business (drafting emails, generating reports), coding (writing snippets of code), and general information retrieval and explanation in a conversational manner. It can also be used creatively, for example in story writing or idea brainstorming."
+  },
+  {
+    "Question": "How does InstructGPT handle ambiguous prompts?",
+    "Answer": "For ambiguous prompts, InstructGPT aims to ask clarifying questions. However, as of its last update, this is a challenge that the model struggles with, and an area of ongoing research. Often, the model may guess the user's intention instead of seeking clarification."
+  },
+  {
+    "Question": "Can InstructGPT generate incorrect or nonsensical responses?",
+    "Answer": "Yes, InstructGPT can sometimes produce plausible-sounding but incorrect or nonsensical answers. While the model is trained to generate accurate and sensible responses, errors can occur because it does not have a source of truth during RL training, and because it might get rewarded for plausible but incorrect answers."
+  },
+  {
+    "Question": "How does InstructGPT manage harmful and biased instructions?",
+    "Answer": "InstructGPT has a moderation system in place to refuse generating certain types of unsafe content. However, the system is not perfect and can have false positives and negatives. OpenAI uses ongoing research and user feedback to continually improve the safety and usefulness of models like InstructGPT."
+  },
+  {
+    "Question": "What is the role of human evaluators in the training of InstructGPT?",
+    "Answer": "Human evaluators play a crucial role in the training of InstructGPT. They follow guidelines provided by OpenAI to rank different responses by quality. This comparison data is then used to fine-tune the model. Over time, the guidelines are iteratively refined based on weekly meetings with the evaluators to address questions and provide clarifications."
+  },
+  {
+    "Question": "What are the limitations of InstructGPT?",
+    "Answer": "There are several limitations to InstructGPT. It might produce plausible but incorrect answers. It tends to be excessively verbose and overuses certain phrases. It can sometimes fail to ask clarifying questions in response to ambiguous queries."
+  },
+  {
+    "Question": "What is reinforcement learning with human feedback?",
+    "Answer": "Reinforcement learning with human feedback is a type of machine learning approach in which an artificial intelligence learns to make decisions based on feedback given by humans. This feedback is used to reinforce the AI's understanding of good and bad actions, improving its decision-making capabilities over time. It's a way of transferring human knowledge and values to the AI system, which can be particularly useful in complex environments where it may be difficult for the AI to learn purely from raw data."
+  },
+  {
+    "Question": "How is human feedback integrated into the reinforcement learning process?",
+    "Answer": "Human feedback can be integrated into reinforcement learning in a few ways. One common method is to use human evaluations as rewards or penalties for the AI's actions during training. In this approach, humans review the AI's actions and provide positive feedback for good actions and negative feedback for bad actions. This feedback is then used to adjust the AI's policy, nudging it towards better decisions. Alternatively, humans can be involved more directly in the learning process, for example by demonstrating correct behavior for the AI to imitate, or by providing input on the AI's proposed action plans."
+  },
+  {
+    "Question": "What are the advantages of reinforcement learning with human feedback?",
+    "Answer": "Reinforcement learning with human feedback can potentially lead to more robust, safe, and ethical AI systems. Humans can provide nuanced feedback that reflects complex real-world values and considerations, which can be difficult for an AI to learn on its own. This can be particularly valuable in scenarios where mistakes can have serious consequences, or where it's important for the AI to align closely with human values and norms. Additionally, human feedback can help to overcome some of the limitations of traditional reinforcement learning, such as the difficulty of specifying a suitable reward function in complex environments."
+  },
+  {
+    "Question": "What are some challenges of reinforcement learning with human feedback?",
+    "Answer": "One of the main challenges is the potential for bias in human feedback, which can lead to biased AI behavior. Also, providing feedback can be time-consuming and labor-intensive, which may limit the scalability of this approach. Furthermore, it can be difficult to provide consistent and accurate feedback in complex environments, especially when the optimal action is not clear. Finally, there's the challenge of extrapolation: the AI may need to make decisions in situations that were not covered by the human-provided examples, and it's not always clear how best to generalize from the feedback it has received."
+  },
+  {
+    "Question": "Can you give an example of an application of reinforcement learning with human feedback?",
+    "Answer": "One potential application of reinforcement learning with human feedback is in autonomous vehicles. In this context, human drivers could provide feedback on the AI's driving decisions, helping it to learn to drive safely and efficiently. The human feedback could complement other sources of training data, such as sensor inputs and historical driving data, enabling the AI to learn from a combination of human expertise and large-scale data analysis. This approach could help to address some of the challenges of autonomous driving, such as navigating complex traffic scenarios or making ethical decisions in emergency situations."
+  },
+  {
+    "Question": "What is 'step 1 supervised finetuning' in reinforcement learning with human feedback?",
+    "Answer": "'Step 1 supervised finetuning' in reinforcement learning with human feedback is the initial stage of the learning process where a model is trained on human-generated data. In this stage, the model is fine-tuned to perform tasks as a human demonstrator would. This involves providing the AI with examples of the correct behavior or decision, and then the AI uses this information to update its own parameters to match these demonstrations. This process is akin to supervised learning, hence the term 'supervised finetuning'."
+  },
+  {
+    "Question": "Why does step 1 supervised finetuning work in reinforcement learning with human feedback?",
+    "Answer": "Step 1 supervised finetuning works in reinforcement learning with human feedback because it provides the model with a strong initial policy that aligns with human demonstrations. This human-like behavior provides a useful starting point for the model, enabling it to behave intelligently even before it starts learning from its own experiences. The AI system can then build on this foundation, refining and expanding its capabilities through further reinforcement learning."
+  },
+  {
+    "Question": "What are some commonly used evaluation metrics in step 1 supervised finetuning?",
+    "Answer": "Common evaluation metrics in supervised finetuning typically assess how closely the model's behavior matches the human demonstrations. This can be measured in terms of accuracy (the proportion of decisions where the model's action matches the human demonstrator's action), or loss functions like cross-entropy loss (which measures the dissimilarity between the model's predicted action probabilities and the actual actions taken by the human demonstrator). Additionally, in reinforcement learning settings, one might also evaluate the model's performance in terms of the rewards it achieves in the task environment."
+  },
+  {
+    "Question": "How is step 1 supervised finetuning used in reinforcement learning with human feedback?",
+    "Answer": "In step 1 supervised finetuning, a model is first pretrained on a large dataset (for example, a dataset of text for a language model), and then further trained on a smaller, task-specific dataset that contains human demonstrations of the task at hand. The aim is to adjust the model's parameters so that it can mimic the human demonstrator's behavior as closely as possible. This involves presenting the model with input-output pairs from the human demonstrations, computing the model's loss on these examples, and then updating the model's parameters to minimize this loss."
+  },
+  {
+    "Question": "What are some common applications of step 1 supervised finetuning?",
+    "Answer": "Step 1 supervised finetuning can be used in a variety of reinforcement learning applications. One common application is in the training of autonomous vehicles, where human demonstrations can provide a useful starting point for the AI's driving policy. Other applications might include game playing (where the AI could be fine-tuned on human gameplay), robot control (where the AI could be fine-tuned on human control commands), and dialogue systems (where the AI could be fine-tuned on human-generated dialogues)."
+  },
+  {
+    "Question": "What is the role of human demonstrations in step 1 supervised finetuning?",
+    "Answer": "Human demonstrations play a crucial role in step 1 supervised finetuning. They serve as the training data that the model uses to learn the task at hand. These demonstrations can provide the model with a rich source of information about how to perform the task effectively, reflecting human expertise, strategy, and values. The goal of the finetuning process is to enable the model to mimic this human-like behavior as closely as possible."
+  },
+  {
+    "Question": "How is the training data collected for step 1 supervised finetuning?",
+    "Answer": "The training data for step 1 supervised finetuning is typically collected through human demonstrations of the task at hand. For example, in the context of autonomous driving, the training data might consist of recordings of human drivers' actions and the corresponding driving scenarios. These demonstrations are then processed into a format that the model can learn from, such as input-output pairs representing the driving scenario and the human driver's action."
+  },
+  {
+    "Question": "What are the limitations of step 1 supervised finetuning?",
+    "Answer": "While supervised finetuning can provide a strong starting point for reinforcement learning, it also has some limitations. One key limitation is that it relies on the quality of the human demonstrations: if the demonstrations are suboptimal, biased, or inconsistent, this could negatively impact the model's performance. Furthermore, supervised finetuning may not enable the model to exceed the level of performance of the human demonstrator, and it may struggle to generalize to scenarios that are not represented in the demonstrations."
+  },
+  {
+    "Question": "How does step 1 supervised finetuning fit into the broader reinforcement learning with human feedback pipeline?",
+    "Answer": "Step 1 supervised finetuning is typically the initial stage in the reinforcement learning with human feedback pipeline. After the model has been fine-tuned on human demonstrations, it can then proceed to further stages of learning, such as reinforcement learning from its own experiences and additional fine-tuning based on human feedback. These further stages can enable the model to refine and expand its capabilities, potentially surpassing the level of performance demonstrated in the initial human demonstrations."
+  },
+  {
+    "Question": "What are the prerequisites for using step 1 supervised finetuning in reinforcement learning with human feedback?",
+    "Answer": "To use step 1 supervised finetuning in reinforcement learning with human feedback, you need a model that is capable of learning from human demonstrations (such as a deep neural network), a dataset of human demonstrations for the task at hand, and a suitable loss function to guide the finetuning process. You also need the computational resources to train the model, as well as the expertise to design, implement, and evaluate the finetuning process."
+  },
+  {
+    "Question": "What is 'reward modeling' in reinforcement learning with human feedback?",
+    "Answer": "Reward modeling is the second step in reinforcement learning with human feedback. It's a process where an artificial intelligence agent learns what to do in a specific environment by getting a reward signal from a model trained on human evaluations. This 'reward model' is trained to predict the rewards or preferences that a human would give to different possible actions or outcomes, providing the AI with a kind of map of human values and preferences that it can use to guide its own behavior."
+  },
+  {
+    "Question": "Why does reward modeling work in reinforcement learning with human feedback?",
+    "Answer": "Reward modeling works because it bridges the gap between the relatively low-dimension human feedback and the high-dimension action space of the AI. By using a model to predict human rewards, the AI can generalize from the feedback it has received to a much wider range of situations. This enables the AI to learn how to behave in ways that align with human values and preferences, even in scenarios that were not covered by the initial human feedback."
+  },
+  {
+    "Question": "What are some commonly used evaluation metrics in reward modeling?",
+    "Answer": "In reward modeling, one common evaluation metric is the prediction error on a held-out validation set of human feedback. This measures how well the reward model is able to predict the rewards that humans would give to different actions or outcomes. Other evaluation metrics might be task-specific, such as the rewards achieved by the AI in the task environment, or measures of the AI's behavior, such as its safety or ethical compliance."
+  },
+  {
+    "Question": "How is reward modeling used in reinforcement learning with human feedback?",
+    "Answer": "In reward modeling, a model is trained on a dataset of human feedback, learning to predict the rewards or preferences that a human would assign to different actions or outcomes. The AI then uses this reward model to guide its own behavior, trying to take actions that the model predicts would receive high rewards from humans. This might involve techniques like reinforcement learning, where the AI learns a policy that maximizes the predicted rewards over time."
+  },
+  {
+    "Question": "What are some common applications of reward modeling?",
+    "Answer": "Reward modeling can be applied in any scenario where we want an AI to learn to behave in ways that align with human values and preferences. Some examples could include autonomous driving (where the AI learns to drive in a way that humans would consider safe and efficient), game playing (where the AI learns to play in a way that humans would consider skilled and strategic), and content recommendation (where the AI learns to recommend content that humans would find relevant and interesting)."
+  },
+  {
+    "Question": "What is the role of human feedback in reward modeling?",
+    "Answer": "Human feedback plays a critical role in reward modeling. It serves as the training data for the reward model, providing a source of information about human values and preferences. By learning to predict these human rewards, the AI can learn to behave in ways that align with human values, effectively using the human feedback as a compass to navigate its action space."
+  },
+  {
+    "Question": "How is the training data collected for reward modeling?",
+    "Answer": "The training data for reward modeling is typically collected by having humans evaluate different actions or outcomes. For example, in the context of a game-playing AI, humans might play the game themselves or evaluate AI-generated gameplay, giving feedback on different moves or strategies. This feedback is then processed into a format that the reward model can learn from, such as input-output pairs representing the action or outcome and the corresponding human reward."
+  },
+  {
+    "Question": "What are the limitations of reward modeling?",
+    "Answer": "One limitation of reward modeling is that it relies on the quality and representativeness of the human feedback. If the feedback is biased, inconsistent, or doesn't cover a wide range of situations, the reward model may struggle to make accurate predictions. Another challenge is that reward modeling often requires significant amounts of human feedback, which can be time-consuming and costly to collect. Finally, reward modeling may struggle to capture more complex aspects of human values and preferences, particularly in novel or ambiguous situations."
+  },
+  {
+    "Question": "How does reward modeling fit into the broader reinforcement learning with human feedback pipeline?",
+    "Answer": "Reward modeling is typically the second step in the reinforcement learning with human feedback pipeline, following the initial supervised finetuning. After the AI has learned to mimic human demonstrations through supervised finetuning, it can then use reward modeling to further refine its behavior based on human feedback. This can enable the AI to learn more nuanced behaviors, and to generalize to a wider range of situations, than could be achieved through supervised finetuning alone."
+  },
+  {
+    "Question": "What are the prerequisites for using reward modeling in reinforcement learning with human feedback?",
+    "Answer": "To use reward modeling in reinforcement learning with human feedback, you need a model that is capable of learning from human feedback (such as a deep neural network), a dataset of human feedback for the task at hand, and a suitable loss function to guide the training of the reward model. You also need the computational resources to train the reward model, as well as the expertise to design, implement, and evaluate the reward modeling process."
+  },
+  {
+    "Question": "What is training a reinforcement learning (RL) model in the context of reinforcement learning with human feedback?",
+    "Answer": "Training an RL model in this context refers to the third step in the process, where the agent uses the reward model developed in the previous step to learn an optimal policy. The agent interacts with the environment, makes decisions, and gets feedback from the reward model. It then adjusts its behavior in order to maximize the expected cumulative reward. This is an iterative process, gradually improving the policy over time."
+  },
+  {
+    "Question": "Why does training an RL model work in reinforcement learning with human feedback?",
+    "Answer": "Training an RL model works because it allows the model to learn from its own experiences. The model uses the feedback from the reward model to evaluate its actions and to gradually improve its policy over time. The feedback provides a form of supervision, guiding the model towards actions that are predicted to align with human preferences. As the model gathers more experience and receives more feedback, it can learn to perform the task more effectively."
+  },
+  {
+    "Question": "What are some commonly used evaluation metrics in training an RL model?",
+    "Answer": "The most common evaluation metric in training an RL model is the cumulative reward that the model achieves in the task environment. This can be measured, for example, as the average reward per episode, or the total reward over a fixed number of episodes. Other evaluation metrics might include task-specific performance measures (like accuracy or precision in a classification task), measures of the model's behavior (like safety or ethical compliance), or computational measures (like training time or computational complexity)."
+  },
+  {
+    "Question": "How is training an RL model used in reinforcement learning with human feedback?",
+    "Answer": "In training an RL model, the model interacts with the environment, makes decisions, and gets feedback from the reward model. It uses this feedback to adjust its behavior, trying to maximize the expected cumulative reward. This might involve techniques like Q-learning or policy gradient, where the model learns to estimate the value of different actions, or to directly optimize its policy. The model continues to learn and improve its policy as it gathers more experience and receives more feedback."
+  },
+  {
+    "Question": "What are some common applications of training an RL model?",
+    "Answer": "Training an RL model can be used in a wide range of applications. Some examples include game playing (where the model learns to play a game more effectively), autonomous driving (where the model learns to drive a vehicle safely and efficiently), robotics (where the model learns to control a robot to perform various tasks), recommendation systems (where the model learns to recommend items that users will like), and many more."
+  },
+  {
+    "Question": "How can loss explosion be avoided during the training of an RL model?",
+    "Answer": "Loss explosion can be avoided through a variety of techniques. One common approach is gradient clipping, where the gradient is artificially capped at a maximum value to prevent excessively large updates. Another approach is to use a learning rate schedule that gradually reduces the learning rate over time, making the updates more conservative as training progresses. Regularization techniques like weight decay or dropout can also help to prevent overfitting and stabilize the learning process."
+  },
+  {
+    "Question": "What are common failures in training an RL model?",
+    "Answer": "Common failures in training an RL model include instability (where the model's performance fluctuates wildly during training), slow convergence (where the model takes a long time to learn an effective policy), overfitting (where the model learns to perform well on the training data but performs poorly on new data), and catastrophic forgetting (where the model loses its ability to perform tasks it has previously learned while learning new tasks). Additionally, RL models can sometimes learn to exploit loopholes in the reward function, leading to undesirable behavior."
+  },
+  {
+    "Question": "What are the roles of exploration and exploitation in training an RL model?",
+    "Answer": "Exploration and exploitation play key roles in training an RL model. Exploration refers to the process of trying out new actions to gather information about the environment, while exploitation refers to the process of using the information the model has already learned to make the best decision. Balancing exploration and exploitation is a fundamental challenge in RL, as it requires the model to manage the trade-off between learning more about the environment and maximizing its reward."
+  },
+  {
+    "Question": "How does training an RL model fit into the broader reinforcement learning with human feedback pipeline?",
+    "Answer": "Training an RL model is typically the third step in the reinforcement learning with human feedback pipeline. After the model has been fine-tuned on human demonstrations (step 1) and a reward model has been trained based on human feedback (step 2), the model can then use this reward model to guide its learning in the task environment. This enables the model to refine its behavior based on its own experiences, potentially learning to perform the task more effectively than the initial human demonstrators."
+  },
+  {
+    "Question": "What are the prerequisites for training an RL model in reinforcement learning with human feedback?",
+    "Answer": "To train an RL model in reinforcement learning with human feedback, you need a model that is capable of learning from reward feedback (such as a deep neural network), a task environment where the model can interact and receive feedback, and a reward model that can provide feedback on the model's actions. You also need the computational resources to run the training process, as well as the expertise to design, implement, and evaluate the training process."
+  }
+]
\ No newline at end of file
diff --git a/example/rlhf/mlflow/supervised_finetuning_demo_mlflow.py b/example/rlhf/mlflow/supervised_finetuning_demo_mlflow.py
new file mode 100644
index 0000000..a54a218
--- /dev/null
+++ b/example/rlhf/mlflow/supervised_finetuning_demo_mlflow.py
@@ -0,0 +1,52 @@
+"""Demo for the supervised fine tuning.
+
+python -m example.rlhf.supervised_finetuning_demo_mlflow
+"""
+
+from pykoi.chat import QuestionAnswerDatabase
+from pykoi.rlhf import RLHFConfig
+from pykoi.rlhf import SupervisedFinetuning
+import mlflow
+import datetime
+
+from pykoi.chat.db.constants import (
+    QA_CSV_HEADER_ID,
+    QA_CSV_HEADER_QUESTION,
+    QA_CSV_HEADER_ANSWER,
+    QA_CSV_HEADER_VOTE_STATUS)
+
+# get data from local database
+qa_database = QuestionAnswerDatabase()
+my_data_pd = qa_database.retrieve_all_question_answers_as_pandas()
+my_data_pd = my_data_pd[[
+    QA_CSV_HEADER_ID,
+    QA_CSV_HEADER_QUESTION,
+    QA_CSV_HEADER_ANSWER,
+    QA_CSV_HEADER_VOTE_STATUS]]
+
+# analyze the data
+print(my_data_pd)
+print("My local database has {} samples in total".format(my_data_pd.shape[0]))
+
+# Set up mlflow experiment
+# mlflow.set_tracking_uri("http://x.x.x.x:5000")
+mlflow.set_experiment("rlhf_step1_sft/" + str(datetime.datetime.now()))
+
+# Set pykoi parameters
+base_model_path = "databricks/dolly-v2-3b"
+dataset_type = "local_db"
+peft_model_path = "./models/rlhf_step1_sft"
+
+# Manually log pykoi parameters into mlflow. Other parameters at torch level are automatically logged.
+mlflow.log_param("pykoi_base_model_path", base_model_path)
+mlflow.log_param("pykoi_dataset_type", dataset_type)
+mlflow.log_param("pykoi_peft_model_path", peft_model_path)
+
+# Run supervised finetuning.
+# Training metrics are automatically logged.
+config = RLHFConfig(base_model_path=base_model_path, dataset_type=dataset_type)
+rlhf_step1_sft = SupervisedFinetuning(config)
+rlhf_step1_sft.train_and_save(peft_model_path)
+
+# Save the trained peft model into mlflow artifacts.
+mlflow.log_artifacts(peft_model_path)
diff --git a/example/rlhf/ranking.csv b/example/rlhf/ranking.csv
new file mode 100644
index 0000000..dea9695
--- /dev/null
+++ b/example/rlhf/ranking.csv
@@ -0,0 +1,31 @@
+id,question,up_ranking_answer,low_ranking_answer
+1,who are u?, Unemployed, I'm a girl
+2,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+3,who are u?, Unemployed, I'm a girl
+4,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+5,how are u?, i am fine, I'm fine
+6,what is the roda?, The roda is a circle dance performed by a group of people who are called the roda., the roda is the wheel of the law that the buddha turned when he was teaching the dharma
+7,what is the roda?, the roda is the wheel of the law that the buddha turned when he was teaching the dharma, The roda is a circle dance performed by a group of people who are called the roda.
+8,what is the roda?, The roda is a circle dance performed by a group of people who are called the roda., the roda is the wheel of the law that the buddha turned when he was teaching the dharma
+9,what is the roda?, the roda is the wheel of the law that the buddha turned when he was teaching the dharma, The roda is a circle dance performed by a group of people who are called the roda.
+10,what is the roda?, The roda is a circle dance performed by a group of people who are called the roda., the roda is the wheel of the law that the buddha turned when he was teaching the dharma
+11,what is the roda?, the roda is the wheel of the law that the buddha turned when he was teaching the dharma, The roda is a circle dance performed by a group of people who are called the roda.
+12,what is the weather today?, The weather today is sunny with a high of 75 degrees., it is rainy today
+13,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+14,who are u?, I'm a girl, Unemployed
+15,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+16,what is the weather today?, The weather today is sunny with a high of 75 degrees., it is rainy today
+17,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+18,how are u?, i am fine, I'm fine
+19,how are u?, I'm fine, i am fine
+20,what is the weather today?, it is rainy today, The weather today is sunny with a high of 75 degrees.
+21,who is Jojo?, Jojo is a guy who has a blog on Tumblr and a YouTube channel where he vlogs about his life. He's a pretty big deal on the internet., Jojo is the narrator of the story.
+22,who is Jojo?, Jojo is a guy who has a blog on Tumblr and a YouTube channel where he vlogs about his life. He's a pretty big deal on the internet., Jojo is the narrator of the story.
+23,who is Jojo?, Jojo is a guy who has a blog on Tumblr and a YouTube channel where he vlogs about his life. He's a pretty big deal on the internet., Jojo is the narrator of the story.
+24,who is Jojo?, Jojo is the protagonist of Jojo's Bizarre Adventure., Jojo is a fictional character in the manga series JoJo's Bizarre Adventure.
+25,who is Jojo?, Jojo is the protagonist of Jojo's Bizarre Adventure., Jojo is a fictional character in the manga series JoJo's Bizarre Adventure.
+26,who is Jojo?, Jojo is the protagonist of Jojo's Bizarre Adventure., Jojo is a fictional character in the manga series JoJo's Bizarre Adventure.
+27,who is Jojo?, Jojo is the protagonist of Jojo's Bizarre Adventure., Jojo is a fictional character in the manga series JoJo's Bizarre Adventure.
+28,who is Jojo?, Jojo is the protagonist of Jojo's Bizarre Adventure., Jojo is a fictional character in the manga series JoJo's Bizarre Adventure.
+29,who is Kimi?, Kimi is the girl that is in the video and is a sex slave to the Rothschilds., Kimi is a very talented driver but he's not the fastest driver out there. He's not the best driver out there. He's not the driver who is always fastest in qualifying. He's not the driver who is always fastest in the race. He's not the driver who is always the best in the wet. He's not the driver who is always the best in the dry. He's not the driver who is always the best overtaker. He's not the driver who is always
+30,who is Kimi?, Kimi is a very talented driver but he's not the fastest driver out there. He's not the best driver out there. He's not the driver who is always fastest in qualifying. He's not the driver who is always fastest in the race. He's not the driver who is always the best in the wet. He's not the driver who is always the best in the dry. He's not the driver who is always the best overtaker. He's not the driver who is always, Kimi is the girl that is in the video and is a sex slave to the Rothschilds.
diff --git a/pyproject.toml b/pyproject.toml
index 4ea2f0a..73b518b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ datasets = { version = "2.13.1", optional = true }
 evaluate = { version = "0.4.0", optional = true }
 peft = { version = "0.5.0", optional = true }
 trl = { version = "0.4.7", optional = true }
+mlflow = { version = "2.7.1", optional = true }
 
 [tool.poetry.extras]
 huggingface = [
@@ -69,7 +70,8 @@ rlhf = [
     "datasets",
     "evaluate",
     "peft",
-    "trl"
+    "trl",
+    "mlflow"
 ]
 
 [tool.poetry.group.dev.dependencies]

	question	up_ranking_answer	low_ranking_answer
id
1	who are u?	Unemployed	I'm a girl
2	what is the weather today?	it is rainy today	The weather today is sunny with a high of 75 ...
3	who are u?	Unemployed	I'm a girl
4	what is the weather today?	it is rainy today	The weather today is sunny with a high of 75 ...
5	how are u?	i am fine	I'm fine
	Question	Answer
0	What is InstructGPT?	InstructGPT is a language model developed by O...
1	Why does InstructGPT work?	InstructGPT works due to a two-step training p...
2	What are some commonly used evaluation metrics...	One main evaluation metric for InstructGPT is ...
3	How is InstructGPT used?	InstructGPT can be used in any application tha...
4	What are some common applications of InstructGPT?	Common applications of InstructGPT can be in e...
5	How does InstructGPT handle ambiguous prompts?	For ambiguous prompts, InstructGPT aims to ask...
6	Can InstructGPT generate incorrect or nonsensi...	Yes, InstructGPT can sometimes produce plausib...
7	How does InstructGPT manage harmful and biased...	InstructGPT has a moderation system in place t...
8	What is the role of human evaluators in the tr...	Human evaluators play a crucial role in the tr...
9	What are the limitations of InstructGPT?	There are several limitations to InstructGPT. ...
10	What is reinforcement learning with human feed...	Reinforcement learning with human feedback is ...
11	How is human feedback integrated into the rein...	Human feedback can be integrated into reinforc...
12	What are the advantages of reinforcement learn...	Reinforcement learning with human feedback can...
13	What are some challenges of reinforcement lear...	One of the main challenges is the potential fo...
14	Can you give an example of an application of r...	One potential application of reinforcement lea...
15	What is 'step 1 supervised finetuning' in rein...	'Step 1 supervised finetuning' in reinforcemen...
16	Why does step 1 supervised finetuning work in ...	Step 1 supervised finetuning works in reinforc...
17	What are some commonly used evaluation metrics...	Common evaluation metrics in supervised finetu...
18	How is step 1 supervised finetuning used in re...	In step 1 supervised finetuning, a model is fi...
19	What are some common applications of step 1 su...	Step 1 supervised finetuning can be used in a ...
20	What is the role of human demonstrations in st...	Human demonstrations play a crucial role in st...
21	How is the training data collected for step 1 ...	The training data for step 1 supervised finetu...
22	What are the limitations of step 1 supervised ...	While supervised finetuning can provide a stro...
23	How does step 1 supervised finetuning fit into...	Step 1 supervised finetuning is typically the ...
24	What are the prerequisites for using step 1 su...	To use step 1 supervised finetuning in reinfor...
25	What is 'reward modeling' in reinforcement lea...	Reward modeling is the second step in reinforc...
26	Why does reward modeling work in reinforcement...	Reward modeling works because it bridges the g...
27	What are some commonly used evaluation metrics...	In reward modeling, one common evaluation metr...
28	How is reward modeling used in reinforcement l...	In reward modeling, a model is trained on a da...
29	What are some common applications of reward mo...	Reward modeling can be applied in any scenario...
30	What is the role of human feedback in reward m...	Human feedback plays a critical role in reward...
31	How is the training data collected for reward ...	The training data for reward modeling is typic...
32	What are the limitations of reward modeling?	One limitation of reward modeling is that it r...
33	How does reward modeling fit into the broader ...	Reward modeling is typically the second step i...
34	What are the prerequisites for using reward mo...	To use reward modeling in reinforcement learni...
35	What is training a reinforcement learning (RL)...	Training an RL model in this context refers to...
36	Why does training an RL model work in reinforc...	Training an RL model works because it allows t...
37	What are some commonly used evaluation metrics...	The most common evaluation metric in training ...
38	How is training an RL model used in reinforcem...	In training an RL model, the model interacts w...
39	What are some common applications of training ...	Training an RL model can be used in a wide ran...
40	How can loss explosion be avoided during the t...	Loss explosion can be avoided through a variet...
41	What are common failures in training an RL model?	Common failures in training an RL model includ...
42	What are the roles of exploration and exploita...	Exploration and exploitation play key roles in...
43	How does training an RL model fit into the bro...	Training an RL model is typically the third st...
44	What are the prerequisites for training an RL ...	To train an RL model in reinforcement learning...
	ID	Question	Answer	Vote Status	Timestamp
0	1	How is InstructGPT used?	InstructGPT can be used in any application tha...	up	2023-10-07 20:10:04.423359
1	2	What are some common applications of InstructGPT?	Common applications of InstructGPT can be in e...	up	2023-10-07 20:10:04.432908
2	3	How does InstructGPT handle ambiguous prompts?	For ambiguous prompts, InstructGPT aims to ask...	up	2023-10-07 20:10:04.439999
3	4	Can InstructGPT generate incorrect or nonsensi...	Yes, InstructGPT can sometimes produce plausib...	up	2023-10-07 20:10:04.446747
4	5	How does InstructGPT manage harmful and biased...	InstructGPT has a moderation system in place t...	up	2023-10-07 20:10:04.453973