Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added backend/._s2v_old
Binary file not shown.
3 changes: 2 additions & 1 deletion backend/Generator/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ def greedy_decoding (inp_ids,attn_mask,model,tokenizer):


def beam_search_decoding (inp_ids,attn_mask,model,tokenizer,num):
num_beams = max(10, num) # num_beams must be >= num_return_sequences
beam_output = model.generate(input_ids=inp_ids,
attention_mask=attn_mask,
max_length=256,
num_beams=10,
num_beams=num_beams,
num_return_sequences=num,
no_repeat_ngram_size=2,
early_stopping=True
Expand Down
13 changes: 11 additions & 2 deletions backend/Generator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,13 @@ def generate_mcq(self, payload):
sentences = tokenize_into_sentences(text)
modified_text = " ".join(sentences)

keywords = identify_keywords(self.nlp, modified_text, inp['max_questions'], self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))
# Try to extract more keywords than requested, then filter down
# This increases the chance of finding enough valid keywords
target_keywords = min(inp['max_questions'] * 2, len(sentences))
keywords = identify_keywords(self.nlp, modified_text, target_keywords, self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))

# Trim to requested amount after validation
keywords = keywords[:inp['max_questions']]
keyword_sentence_mapping = find_sentences_with_keywords(keywords, sentences)

for k in keyword_sentence_mapping.keys():
Expand Down Expand Up @@ -110,7 +116,10 @@ def generate_shortq(self, payload):
sentences = tokenize_into_sentences(text)
modified_text = " ".join(sentences)

keywords = identify_keywords(self.nlp, modified_text, inp['max_questions'], self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))
# Extract 2x keywords to increase the chance of reaching max_questions
target_keywords = min(inp['max_questions'] * 2, len(sentences))
keywords = identify_keywords(self.nlp, modified_text, target_keywords, self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))
keywords = keywords[:inp['max_questions']]
keyword_sentence_mapping = find_sentences_with_keywords(keywords, sentences)

for k in keyword_sentence_mapping.keys():
Expand Down
71 changes: 50 additions & 21 deletions backend/Generator/mcq.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import string
import nltk
import pke
import torch
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
from nltk.corpus import stopwords
from sense2vec import Sense2Vec
from similarity.normalized_levenshtein import NormalizedLevenshtein
import spacy
from Generator.nltk_utils import safe_nltk_download

nltk.download('brown')
nltk.download('stopwords')
nltk.download('popular')
safe_nltk_download('corpora/brown')
safe_nltk_download('corpora/stopwords')

def is_word_available(word, s2v_model):
word = word.replace(" ", "_")
Expand Down Expand Up @@ -57,15 +57,31 @@ def find_similar_words(word, s2v_model):
def get_answer_choices(answer, s2v_model):
    """Build distractor options for *answer*.

    Tries sense2vec similarity first; whenever that yields fewer than three
    distractors (including the zero-choice and exception cases) the list is
    padded with generic fallback options.

    Args:
        answer: The correct answer string to generate distractors for.
        s2v_model: Loaded sense2vec model passed through to find_similar_words.

    Returns:
        tuple: (choices, source) where source is "sense2vec" when similarity
        alone produced enough options, or "fallback" when padding was needed.
    """
    choices = []

    source = "sense2vec"
    try:
        choices = find_similar_words(answer, s2v_model)
        if len(choices) > 0:
            print("Generated choices successfully for word:", answer)
    except Exception as e:
        print(f"Failed to generate choices for word: {answer}. Error: {e}")

    # Fallback: pad with generic distractors whenever sense2vec produced
    # fewer than 3 usable choices. (Previously an early return fired for any
    # non-empty result, so 1-2 choices skipped this padding entirely.)
    if len(choices) < 3:
        source = "fallback"
        print(f"sense2vec returned {len(choices)} choices for '{answer}', adding generic fallbacks")
        fallbacks = [
            f"Not {answer}",
            "None of the above",
            "Incorrect option",
            "Another answer",
            "Different response"
        ]
        for fb in fallbacks:
            if fb not in choices and len(choices) < 10:
                choices.append(fb)

    return choices, source

def tokenize_into_sentences(text):
sentences = [sent_tokenize(text)]
Expand Down Expand Up @@ -100,35 +116,43 @@ def are_words_distant(words_list, current_word, threshold, normalized_levenshtei
score_list = [normalized_levenshtein.distance(word.lower(), current_word.lower()) for word in words_list]
return min(score_list) >= threshold

def filter_useful_phrases(phrase_keys, max_count, normalized_levenshtein, threshold=0.5):
    """Keep at most *max_count* mutually-distant phrases.

    Phrases are accepted in their given order; a candidate is kept only when
    its normalized Levenshtein distance to every already-kept phrase is at
    least *threshold* (see are_words_distant).

    Args:
        phrase_keys: Candidate phrases, best first.
        max_count: Hard cap on the number of phrases returned.
        normalized_levenshtein: Distance object used by are_words_distant.
        threshold: Minimum distance for a candidate to count as "new".

    Returns:
        list: Filtered phrases, never more than max_count items.
    """
    filtered_phrases = []
    if phrase_keys:
        filtered_phrases.append(phrase_keys[0])
        for ph in phrase_keys[1:]:
            # Check the cap BEFORE appending: the old post-append check let
            # the result grow to max_count + 1 items.
            if len(filtered_phrases) >= max_count:
                break
            if are_words_distant(filtered_phrases, ph, threshold, normalized_levenshtein):
                filtered_phrases.append(ph)
    return filtered_phrases

# Module-level cache so the spaCy pipeline is loaded at most once.
_spacy_nlp = None

def _get_spacy_nlp():
    """Return the shared 'en_core_web_sm' pipeline, loading it on first use."""
    global _spacy_nlp
    if _spacy_nlp is not None:
        return _spacy_nlp
    _spacy_nlp = spacy.load('en_core_web_sm')
    return _spacy_nlp

def extract_noun_phrases(text, max_phrases=10):
    """Extract multi-word noun phrases from *text* using spaCy instead of pke.

    Args:
        text: Raw text to analyse.
        max_phrases: Maximum number of phrases to return (default 10,
            matching the previous hard-coded limit).

    Returns:
        list: Lower-cased, de-duplicated noun phrases in document order;
        whatever was collected so far (possibly empty) if an error occurs.
    """
    out = []
    try:
        nlp = _get_spacy_nlp()
        doc = nlp(text)
        # Only multi-word chunks are kept (len(phrase.split()) > 1);
        # duplicates are dropped while preserving first-seen order.
        for chunk in doc.noun_chunks:
            phrase = chunk.text.lower().strip()
            if len(phrase.split()) > 1 and phrase not in out:
                out.append(phrase)
        return out[:max_phrases]
    except Exception as e:
        print(f"Error extracting noun phrases: {e}")
        return out

def extract_phrases_from_doc(doc):
phrases = {}
for np in doc.noun_chunks:
Expand Down Expand Up @@ -195,6 +219,11 @@ def generate_multiple_choice_questions(keyword_sent_mapping, device, tokenizer,
question_statement = decoded_question.replace("question:", "").strip()
options, options_algorithm = get_answer_choices(answer, sense2vec_model)
options = filter_useful_phrases(options, 10, normalized_levenshtein)

# Ensure we have at least 3 distractors
while len(options) < 3:
options.append(f"Option {len(options) + 1}")

extra_options = options[3:]
options = options[:3]

Expand Down
18 changes: 18 additions & 0 deletions backend/Generator/nltk_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Shared NLTK utility to avoid duplicating _safe_nltk_download across modules."""
import logging
import nltk

logger = logging.getLogger(__name__)


def safe_nltk_download(pkg):
    """Ensure the NLTK resource at *pkg* (e.g. 'corpora/stopwords') exists.

    If the resource is already installed, do nothing. Otherwise attempt a
    quiet download of its short name; any failure is logged as a warning
    rather than raised, so importing callers never crash on a flaky network.
    """
    try:
        nltk.data.find(pkg)
        return  # already present locally — nothing to do
    except LookupError:
        pass

    short_name = pkg.split('/')[-1]
    try:
        if not nltk.download(short_name, quiet=True, raise_on_error=False):
            logger.warning("NLTK resource '%s' download returned False — resource may be unavailable", pkg)
    except Exception as e:
        logger.warning("Failed to download NLTK resource '%s': %s", pkg, e)
10 changes: 6 additions & 4 deletions backend/Generator/question_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@

# Initialize NLTK resources
import nltk
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
from Generator.nltk_utils import safe_nltk_download

safe_nltk_download('tokenizers/punkt')
safe_nltk_download('taggers/averaged_perceptron_tagger_eng')
safe_nltk_download('corpora/wordnet')
safe_nltk_download('corpora/stopwords')

class QuestionEnhancer:
def __init__(self):
Expand Down
68 changes: 68 additions & 0 deletions backend/download_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
Pre-download all required HuggingFace models to local cache.
Run this once before starting the server.
"""
import os

# Use platform-agnostic cache directory (override with HF_HOME env var)
_default_cache = os.path.join(os.path.expanduser('~'), '.cache', 'huggingface')
HF_CACHE_DIR = os.environ.get('HF_HOME', _default_cache)
os.environ['HF_HOME'] = HF_CACHE_DIR
os.environ['TRANSFORMERS_CACHE'] = os.path.join(HF_CACHE_DIR, 'transformers')

print(f"Downloading models to {HF_CACHE_DIR} ...")
print("This may take 10-30 minutes depending on your internet speed.\n")

from transformers import (
T5ForConditionalGeneration, T5Tokenizer,
AutoModelForSequenceClassification, AutoTokenizer,
AutoModelForSeq2SeqLM
)

models = [
('T5Tokenizer', 't5-large'),
('T5ForConditionalGeneration', 'Roasters/Question-Generator'),
('T5Tokenizer', 't5-base'),
('T5ForConditionalGeneration', 'Roasters/Boolean-Questions'),
('T5ForConditionalGeneration', 'Roasters/Answer-Predictor'),
]

for model_type, model_name in models:
print(f" Downloading {model_name} ...")
try:
if model_type == 'T5Tokenizer':
T5Tokenizer.from_pretrained(model_name)
elif model_type == 'T5ForConditionalGeneration':
T5ForConditionalGeneration.from_pretrained(model_name)
print(f" ✓ {model_name} done\n")
except Exception as e:
print(f" ✗ {model_name} failed: {e}\n")

# Also check for QG and QAE models used in QuestionGenerator / AnswerPredictor
import re
try:
with open(os.path.join(os.path.dirname(__file__), 'Generator', 'main.py')) as f:
content = f.read()
# Find QG_PRETRAINED and QAE_PRETRAINED values
qg = re.search(r"QG_PRETRAINED\s*=\s*['\"]([^'\"]+)['\"]", content)
qae = re.search(r"QAE_PRETRAINED\s*=\s*['\"]([^'\"]+)['\"]", content)
nli = re.search(r"nli_model_name\s*=\s*['\"]([^'\"]+)['\"]", content)

for match, label in [(qg, 'QG'), (qae, 'QAE'), (nli, 'NLI')]:
if match:
name = match.group(1)
print(f" Downloading {label} model: {name} ...")
try:
AutoTokenizer.from_pretrained(name, use_fast=False)
AutoModelForSeq2SeqLM.from_pretrained(name)
print(f" ✓ {label} done\n")
except Exception as e:
try:
AutoModelForSequenceClassification.from_pretrained(name)
print(f" ✓ {label} done\n")
except Exception as e2:
print(f" ✗ {label} failed: {e2}\n")
except Exception as e:
print(f"Could not parse main.py for additional models: {e}")

print("\nAll downloads complete! You can now start server.py")
Loading