
Commit 2e3e22f

local inference for batch size > 1
1 parent c8d58ac commit 2e3e22f


2 files changed: +126, -53 lines changed


notebooks/example_json.ipynb

Lines changed: 93 additions & 28 deletions
@@ -9,32 +9,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 3.79it/s]\n",
-      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
-      "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 3.84it/s]\n",
-      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Creating DFA mask store for CodeGenTokenizerFast and json, may take more than 10 minutes. Caching at /home/shubham/syncode/cache/mask_stores/CodeGenTokenizerFast/grammar_strict_1003218229_50257.pkl.\n",
-      "Ignore whitespace tokens is True\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 58/58 [00:22<00:00, 2.54it/s]\n"
+      "/home/shubham/anaconda3/envs/codex/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      " from .autonotebook import tqdm as notebook_tqdm\n"
      ]
     }
    ],
@@ -44,8 +20,15 @@
     "import warnings\n",
     "warnings.filterwarnings('ignore')\n",
     "\n",
-    "model_name = \"microsoft/phi-2\"\n",
-    "\n",
+    "model_name = \"microsoft/phi-2\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# Load the unconstrained original model\n",
     "llm = Syncode(model=model_name, mode='original', max_new_tokens=50)\n",
     "\n",
@@ -118,6 +101,88 @@
     "output = syn_llm.infer(prompt)[0]\n",
     "print(f\"SynCode output:\\n{output}\")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 3.30it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING: Opportunistic mode requires batch_size of 1.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning: Exact lookup not found for (LSQB, 1) in the DFA mask store. This could be an error.\n",
+      "Warning: Exact lookup not found for (UNESCAPED_STRING, 3) in the DFA mask store. This could be an error.\n",
+      "SynCode output 1:\n",
+      "{\n",
+      " \"name\" : \"India\",\n",
+      " \"capital\" : \"New Delhi\",\n",
+      " \"population\" : \"1.3 billion\"\n",
+      "}\n",
+      "\n",
+      "SynCode output 2:\n",
+      "{\n",
+      " \"name\" : \"India\"\n",
+      " }\n",
+      "\n",
+      "SynCode output 3:\n",
+      "{\n",
+      " \"name\" : \"India\"\n",
+      " , \"capital\" : \"New Delhi\"\n",
+      " , \"population\" : \"1.3 billion\"\n",
+      "}\n",
+      "\n",
+      "SynCode output 4:\n",
+      "{\n",
+      " \"name\" : \"India\"\n",
+      "\n",
+      "SynCode output 5:\n",
+      "{\n",
+      " \"name\" : \"India\"\n",
+      " , \"capital\" : \"New Delhi\"\n",
+      " , \"population\" : \"1367\n",
+      "}\n",
+      "\n",
+      "A:\n",
+      "\n",
+      "Try this :\n",
+      "\n",
+      "var data = [\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "syn_llm = Syncode(model=model_name, grammar='json', parse_output_only=True, max_new_tokens=50, num_return_sequences=5, do_sample=True, temperature=0.7)\n",
+    "\n",
+    "prompt = \"Please return a json object to represent country India with name, capital and population?\"\n",
+    "output = syn_llm.infer(prompt)\n",
+    "\n",
+    "for i, out in enumerate(output):\n",
+    " out = out.strip()\n",
+    " print(f\"SynCode output {i+1}:\\n{out}\\n\")"
+   ]
  }
 ],
 "metadata": {

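Note on the new cell: it asks for five sampled completions in a single call (num_return_sequences=5, do_sample=True), and some of the printed samples are cut off at max_new_tokens before the JSON object is closed. A quick way to see which returned strings are complete JSON is to push them through json.loads. The helper below is a minimal sketch, not part of the commit; check_json_outputs is a hypothetical name, and it assumes output is the list returned by syn_llm.infer(prompt) as in the cell above.

import json

def check_json_outputs(outputs):
    # Report, for each sampled completion, whether it parses as valid JSON.
    results = []
    for i, out in enumerate(outputs, start=1):
        try:
            json.loads(out.strip())
            results.append((i, "valid"))
        except json.JSONDecodeError as err:
            results.append((i, f"invalid: {err.msg}"))
    return results

# Usage after the cell above: check_json_outputs(output)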
syncode/language_model.py

Lines changed: 33 additions & 25 deletions
@@ -93,13 +93,13 @@ def generate_batch_completion_grammar(self, prompt, batch_size, stop_words=None,
         stop_criteria = []

         # Generate completions
-        if self.opp and (gen_mode == GenerationMode.SAMPLE or gen_mode == GenerationMode.GREEDY_SEARCH) and batch_size == 1: # Use our own implementation for greedy search and sampling
+        if self.opp and (gen_mode == GenerationMode.SAMPLE or gen_mode == GenerationMode.GREEDY_SEARCH): # Use our own implementation for greedy search and sampling
             generated_ids = self._generate(
                 inputs,
                 gen_config,
                 gen_mode,
                 grammar_decoder=self.grammar_decoder,
-                stop_criteria=stop_criteria
+                stopping_criteria=stop_criteria
             )
         else:
             if self.opp:
@@ -137,20 +137,19 @@ def _generate(
         gen_config:GenerationConfig,
         gen_mode:GenerationMode,
         grammar_decoder:SyncodeLogitsProcessor=None,
-        stop_criteria:StoppingCriteria=[]
+        stopping_criteria:StoppingCriteria=[]
         ):
         """
         We support greedy search and sampling for batch size 1 otherwise we use the generate function from transformers library.
         """
         token_ids, attention_mask, past_key_values = inputs['input_ids'], inputs['attention_mask'], None
-
-        # This does not include grammar decoder
-        self.model._prepare_special_tokens(gen_config, False, device=self.device)
-        logits_processor = self.model._get_logits_processor(gen_config, token_ids.size(1), token_ids, prefix_allowed_tokens_fn=None, logits_processor=[])
-
+        logit_warper = self.model._get_logits_warper(gen_config, device=self.device)
         max_tokens = self.gen_args['max_new_tokens']+token_ids.size(1)
-
-        while True:
+        num_outputs = token_ids.size(0)
+        unfinished_sequences = torch.ones(num_outputs, dtype=torch.long, device=self.device)
+        this_peer_finished = False
+
+        while not this_peer_finished:
             try:
                 if past_key_values: # Get the last token if kv is cached for all previous tokens
                     input_ids = token_ids[..., -1].unsqueeze(-1)
@@ -168,30 +167,39 @@ def _generate(
                 next_token_scores, past_key_values = outputs.logits[:, -1, :], outputs.past_key_values

             if grammar_decoder is not None:
-                next_token = self._get_next_token(gen_mode, token_ids, logits_processor, next_token_scores)
-                is_valid = grammar_decoder.is_valid(token_ids, next_token)
-
-                if not is_valid:
-                    # calling grammar decoder is expensive. Hence, in the opportunist mode, we call it only when the standard generation is syntactically incorrect
-                    next_token_scores = grammar_decoder(token_ids, next_token_scores)
-                    next_token = self._get_next_token(gen_mode, token_ids, logits_processor, next_token_scores)
+                # batch of next tokens
+                next_tokens = self._get_next_token(gen_mode, token_ids, logit_warper, next_token_scores)
+
+                for idx in range(token_ids.size(0)):
+                    token_ids_i = token_ids[idx:idx+1]
+                    next_token_scores_i = next_token_scores[idx:idx+1]
+                    next_token_i = next_tokens[idx:idx+1]
+
+                    is_valid = grammar_decoder.is_valid(token_ids_i, next_token_i)
+
+                    if not is_valid:
+                        # calling grammar decoder is expensive. Hence, in the opportunist mode, we call it only when the standard generation is syntactically incorrect
+                        next_token_scores_i = grammar_decoder(token_ids_i, next_token_scores_i)
+                        next_tokens[idx] = self._get_next_token(gen_mode, token_ids_i, logit_warper, next_token_scores_i)
             else:
-                next_token = self._get_next_token(gen_mode, token_ids, logits_processor, next_token_scores)
+                next_tokens = self._get_next_token(gen_mode, token_ids, logit_warper, next_token_scores)

-            token_ids = torch.cat([token_ids, next_token[:, None]], dim=-1)

-            # Check stopping criteria
-            finish_generation = False
-            for stop_criterion in stop_criteria:
-                if stop_criterion(token_ids, next_token_scores):
-                    finish_generation = True
+            # Update the next token
+            next_tokens = next_tokens * unfinished_sequences + self.tokenizer.eos_token_id * (1 - unfinished_sequences)
+
+            token_ids = torch.cat([token_ids, next_tokens[:, None]], dim=-1)

             # Check if the next token is the end of the sequence or the max tokens is reached
-            if finish_generation or next_token == self.tokenizer.eos_token_id or token_ids.size(1) >= max_tokens:
+            if token_ids.size(1) >= max_tokens:
                 break

             # Update attention mask
             attention_mask = torch.cat([attention_mask, torch.ones((attention_mask.size(0), 1), dtype=attention_mask.dtype).to(self.device)], dim=-1)
+
+            # Update the unfinished sequences
+            unfinished_sequences = unfinished_sequences & ~(stopping_criteria(token_ids, next_token_scores) | (token_ids[:, -1] == self.tokenizer.eos_token_id))
+            this_peer_finished = unfinished_sequences.max() == 0

         return token_ids

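Note on the _generate change: the removed batch_size == 1 guard is replaced by per-row bookkeeping so a single decoding loop can serve batch_size > 1. An unfinished_sequences mask records which rows are still generating, rows that have finished keep receiving the EOS token instead of freshly sampled tokens, and the loop ends once every row is done, mirroring the pattern used by Hugging Face transformers' generate. The sketch below isolates just that mask arithmetic, with random integers standing in for the model's grammar-constrained next-token choices; it is an illustration under those assumptions, not code from the repository.

import torch

eos_token_id = 50256                       # GPT-2 / phi-2 style EOS, also used as pad
batch_size, max_new_tokens = 3, 8
token_ids = torch.zeros(batch_size, 1, dtype=torch.long)   # stand-in prompt tokens

unfinished = torch.ones(batch_size, dtype=torch.long)      # 1 = row still generating
this_peer_finished = False
step = 0

while not this_peer_finished and step < max_new_tokens:
    # Stand-in for the model forward pass + (grammar-constrained) sampling.
    next_tokens = torch.randint(0, 100, (batch_size,))
    if step == 3:
        next_tokens[0] = eos_token_id      # pretend row 0 finishes early

    # Rows that already finished keep receiving EOS instead of new tokens.
    next_tokens = next_tokens * unfinished + eos_token_id * (1 - unfinished)
    token_ids = torch.cat([token_ids, next_tokens[:, None]], dim=-1)

    # A row becomes finished once its latest token is EOS.
    unfinished = unfinished & (token_ids[:, -1] != eos_token_id).long()
    this_peer_finished = bool(unfinished.max() == 0)
    step += 1

print(token_ids)   # finished rows are right-padded with EOS

In the commit itself the mask is additionally ANDed with the negated stopping criteria, so a custom stop word ends a row the same way EOS does.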
0 commit comments
