Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added backend/._s2v_old
Binary file not shown.
3 changes: 2 additions & 1 deletion backend/Generator/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ def greedy_decoding (inp_ids,attn_mask,model,tokenizer):


def beam_search_decoding (inp_ids,attn_mask,model,tokenizer,num):
num_beams = max(10, num) # num_beams must be >= num_return_sequences
beam_output = model.generate(input_ids=inp_ids,
attention_mask=attn_mask,
max_length=256,
num_beams=10,
num_beams=num_beams,
num_return_sequences=num,
no_repeat_ngram_size=2,
early_stopping=True
Expand Down
13 changes: 11 additions & 2 deletions backend/Generator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,13 @@ def generate_mcq(self, payload):
sentences = tokenize_into_sentences(text)
modified_text = " ".join(sentences)

keywords = identify_keywords(self.nlp, modified_text, inp['max_questions'], self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))
# Try to extract more keywords than requested, then filter down
# This increases the chance of finding enough valid keywords
target_keywords = min(inp['max_questions'] * 2, len(sentences))
keywords = identify_keywords(self.nlp, modified_text, target_keywords, self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))

# Trim to requested amount after validation
keywords = keywords[:inp['max_questions']]
keyword_sentence_mapping = find_sentences_with_keywords(keywords, sentences)

for k in keyword_sentence_mapping.keys():
Expand Down Expand Up @@ -110,7 +116,10 @@ def generate_shortq(self, payload):
sentences = tokenize_into_sentences(text)
modified_text = " ".join(sentences)

keywords = identify_keywords(self.nlp, modified_text, inp['max_questions'], self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))
# Extract 2x keywords to increase the chance of reaching max_questions
target_keywords = min(inp['max_questions'] * 2, len(sentences))
keywords = identify_keywords(self.nlp, modified_text, target_keywords, self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))
keywords = keywords[:inp['max_questions']]
keyword_sentence_mapping = find_sentences_with_keywords(keywords, sentences)

for k in keyword_sentence_mapping.keys():
Expand Down
71 changes: 50 additions & 21 deletions backend/Generator/mcq.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import string
import nltk
import pke
import torch
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
from nltk.corpus import stopwords
from sense2vec import Sense2Vec
from similarity.normalized_levenshtein import NormalizedLevenshtein
import spacy
from Generator.nltk_utils import safe_nltk_download

nltk.download('brown')
nltk.download('stopwords')
nltk.download('popular')
safe_nltk_download('corpora/brown')
safe_nltk_download('corpora/stopwords')

def is_word_available(word, s2v_model):
word = word.replace(" ", "_")
Expand Down Expand Up @@ -57,15 +57,31 @@ def find_similar_words(word, s2v_model):
def get_answer_choices(answer, s2v_model):
    """Build distractor options for *answer*.

    Tries sense2vec similarity first; whenever that yields fewer than three
    distractors (including the zero-choice and exception cases) the list is
    padded with generic fallback options.

    Args:
        answer: The correct answer string to generate distractors for.
        s2v_model: Loaded sense2vec model passed through to find_similar_words.

    Returns:
        tuple: (choices, source) where source is "sense2vec" when similarity
        alone produced enough options, or "fallback" when padding was needed.
    """
    choices = []

    source = "sense2vec"
    try:
        choices = find_similar_words(answer, s2v_model)
        if len(choices) > 0:
            print("Generated choices successfully for word:", answer)
    except Exception as e:
        print(f"Failed to generate choices for word: {answer}. Error: {e}")

    # Fallback: pad with generic distractors whenever sense2vec produced
    # fewer than 3 usable choices. (Previously an early return fired for any
    # non-empty result, so 1-2 choices skipped this padding entirely.)
    if len(choices) < 3:
        source = "fallback"
        print(f"sense2vec returned {len(choices)} choices for '{answer}', adding generic fallbacks")
        fallbacks = [
            f"Not {answer}",
            "None of the above",
            "Incorrect option",
            "Another answer",
            "Different response"
        ]
        for fb in fallbacks:
            if fb not in choices and len(choices) < 10:
                choices.append(fb)

    return choices, source

def tokenize_into_sentences(text):
sentences = [sent_tokenize(text)]
Expand Down Expand Up @@ -100,35 +116,43 @@ def are_words_distant(words_list, current_word, threshold, normalized_levenshtei
score_list = [normalized_levenshtein.distance(word.lower(), current_word.lower()) for word in words_list]
return min(score_list) >= threshold

def filter_useful_phrases(phrase_keys, max_count, normalized_levenshtein, threshold=0.5):
    """Keep at most *max_count* mutually-distant phrases.

    Phrases are accepted in their given order; a candidate is kept only when
    its normalized Levenshtein distance to every already-kept phrase is at
    least *threshold* (see are_words_distant).

    Args:
        phrase_keys: Candidate phrases, best first.
        max_count: Hard cap on the number of phrases returned.
        normalized_levenshtein: Distance object used by are_words_distant.
        threshold: Minimum distance for a candidate to count as "new".

    Returns:
        list: Filtered phrases, never more than max_count items.
    """
    filtered_phrases = []
    if phrase_keys:
        filtered_phrases.append(phrase_keys[0])
        for ph in phrase_keys[1:]:
            # Check the cap BEFORE appending: the old post-append check let
            # the result grow to max_count + 1 items.
            if len(filtered_phrases) >= max_count:
                break
            if are_words_distant(filtered_phrases, ph, threshold, normalized_levenshtein):
                filtered_phrases.append(ph)
    return filtered_phrases

# Module-level cache so the spaCy pipeline is loaded at most once.
_spacy_nlp = None

def _get_spacy_nlp():
    """Return the shared 'en_core_web_sm' pipeline, loading it on first use."""
    global _spacy_nlp
    if _spacy_nlp is not None:
        return _spacy_nlp
    _spacy_nlp = spacy.load('en_core_web_sm')
    return _spacy_nlp

def extract_noun_phrases(text, max_phrases=10):
    """Extract multi-word noun phrases from *text* using spaCy instead of pke.

    Args:
        text: Raw text to analyse.
        max_phrases: Maximum number of phrases to return (default 10,
            matching the previous hard-coded limit).

    Returns:
        list: Lower-cased, de-duplicated noun phrases in document order;
        whatever was collected so far (possibly empty) if an error occurs.
    """
    out = []
    try:
        nlp = _get_spacy_nlp()
        doc = nlp(text)
        # Only multi-word chunks are kept (len(phrase.split()) > 1);
        # duplicates are dropped while preserving first-seen order.
        for chunk in doc.noun_chunks:
            phrase = chunk.text.lower().strip()
            if len(phrase.split()) > 1 and phrase not in out:
                out.append(phrase)
        return out[:max_phrases]
    except Exception as e:
        print(f"Error extracting noun phrases: {e}")
        return out

def extract_phrases_from_doc(doc):
phrases = {}
for np in doc.noun_chunks:
Expand Down Expand Up @@ -195,6 +219,11 @@ def generate_multiple_choice_questions(keyword_sent_mapping, device, tokenizer,
question_statement = decoded_question.replace("question:", "").strip()
options, options_algorithm = get_answer_choices(answer, sense2vec_model)
options = filter_useful_phrases(options, 10, normalized_levenshtein)

# Ensure we have at least 3 distractors
while len(options) < 3:
options.append(f"Option {len(options) + 1}")

extra_options = options[3:]
options = options[:3]

Expand Down
18 changes: 18 additions & 0 deletions backend/Generator/nltk_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Shared NLTK utility to avoid duplicating _safe_nltk_download across modules."""
import logging
import nltk

logger = logging.getLogger(__name__)


def safe_nltk_download(pkg):
    """Ensure the NLTK resource at *pkg* (e.g. 'corpora/stopwords') exists.

    If the resource is already installed, do nothing. Otherwise attempt a
    quiet download of its short name; any failure is logged as a warning
    rather than raised, so importing callers never crash on a flaky network.
    """
    try:
        nltk.data.find(pkg)
        return  # already present locally — nothing to do
    except LookupError:
        pass

    short_name = pkg.split('/')[-1]
    try:
        if not nltk.download(short_name, quiet=True, raise_on_error=False):
            logger.warning("NLTK resource '%s' download returned False — resource may be unavailable", pkg)
    except Exception as e:
        logger.warning("Failed to download NLTK resource '%s': %s", pkg, e)
10 changes: 6 additions & 4 deletions backend/Generator/question_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@

# Initialize NLTK resources
import nltk
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
from Generator.nltk_utils import safe_nltk_download

safe_nltk_download('tokenizers/punkt')
safe_nltk_download('taggers/averaged_perceptron_tagger_eng')
safe_nltk_download('corpora/wordnet')
safe_nltk_download('corpora/stopwords')

class QuestionEnhancer:
def __init__(self):
Expand Down
68 changes: 68 additions & 0 deletions backend/download_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
Pre-download all required HuggingFace models to local cache.
Run this once before starting the server.
"""
import os

# Use platform-agnostic cache directory (override with HF_HOME env var)
_default_cache = os.path.join(os.path.expanduser('~'), '.cache', 'huggingface')
HF_CACHE_DIR = os.environ.get('HF_HOME', _default_cache)
os.environ['HF_HOME'] = HF_CACHE_DIR
os.environ['TRANSFORMERS_CACHE'] = os.path.join(HF_CACHE_DIR, 'transformers')

print(f"Downloading models to {HF_CACHE_DIR} ...")
print("This may take 10-30 minutes depending on your internet speed.\n")

from transformers import (
T5ForConditionalGeneration, T5Tokenizer,
AutoModelForSequenceClassification, AutoTokenizer,
AutoModelForSeq2SeqLM
)

models = [
('T5Tokenizer', 't5-large'),
('T5ForConditionalGeneration', 'Roasters/Question-Generator'),
('T5Tokenizer', 't5-base'),
('T5ForConditionalGeneration', 'Roasters/Boolean-Questions'),
('T5ForConditionalGeneration', 'Roasters/Answer-Predictor'),
]

for model_type, model_name in models:
print(f" Downloading {model_name} ...")
try:
if model_type == 'T5Tokenizer':
T5Tokenizer.from_pretrained(model_name)
elif model_type == 'T5ForConditionalGeneration':
T5ForConditionalGeneration.from_pretrained(model_name)
print(f" ✓ {model_name} done\n")
except Exception as e:
print(f" ✗ {model_name} failed: {e}\n")

# Also check for QG and QAE models used in QuestionGenerator / AnswerPredictor
import re
try:
with open(os.path.join(os.path.dirname(__file__), 'Generator', 'main.py')) as f:
content = f.read()
# Find QG_PRETRAINED and QAE_PRETRAINED values
qg = re.search(r"QG_PRETRAINED\s*=\s*['\"]([^'\"]+)['\"]", content)
qae = re.search(r"QAE_PRETRAINED\s*=\s*['\"]([^'\"]+)['\"]", content)
nli = re.search(r"nli_model_name\s*=\s*['\"]([^'\"]+)['\"]", content)

for match, label in [(qg, 'QG'), (qae, 'QAE'), (nli, 'NLI')]:
if match:
name = match.group(1)
print(f" Downloading {label} model: {name} ...")
try:
AutoTokenizer.from_pretrained(name, use_fast=False)
AutoModelForSeq2SeqLM.from_pretrained(name)
print(f" ✓ {label} done\n")
except Exception as e:
try:
AutoModelForSequenceClassification.from_pretrained(name)
print(f" ✓ {label} done\n")
except Exception as e2:
print(f" ✗ {label} failed: {e2}\n")
except Exception as e:
print(f"Could not parse main.py for additional models: {e}")

print("\nAll downloads complete! You can now start server.py")
Loading