From 68b44bc46e18181a470c7a902e699ab6a7f964b5 Mon Sep 17 00:00:00 2001
From: itz-sidd
Date: Thu, 19 Feb 2026 23:15:13 +0530
Subject: [PATCH 1/5] fix: push NLI model and input tensors to device in AnswerPredictor

---
 backend/Generator/main.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/backend/Generator/main.py b/backend/Generator/main.py
index 04aed79f..ed4ebbdb 100644
--- a/backend/Generator/main.py
+++ b/backend/Generator/main.py
@@ -23,6 +23,9 @@
 import fitz
 import mammoth
+
+
+
 
 class MCQGenerator:
 
     def __init__(self):
@@ -251,6 +254,9 @@ def __init__(self):
         self.nli_tokenizer = AutoTokenizer.from_pretrained(self.nli_model_name)
         self.nli_model = AutoModelForSequenceClassification.from_pretrained(self.nli_model_name)
 
+        # Explicitly push the NLI model to the detected hardware (GPU or CPU)
+        self.nli_model.to(self.device)
+
         self.set_seed(42)
 
     def set_seed(self, seed):
@@ -296,6 +302,10 @@ def predict_boolean_answer(self, payload):
         for question in input_questions:
             hypothesis = question
             inputs = self.nli_tokenizer.encode_plus(input_text, hypothesis, return_tensors="pt")
+
+            # Push the input tensors to the same device as the model
+            inputs = {key: value.to(self.device) for key, value in inputs.items()}
+
             outputs = self.nli_model(**inputs)
             logits = outputs.logits
             probabilities = torch.softmax(logits, dim=1)

From 13cad6e3e02a3e9dd05876bc4bb2e8a83d863b20 Mon Sep 17 00:00:00 2001
From: itz-sidd
Date: Sun, 1 Mar 2026 23:24:06 +0530
Subject: [PATCH 2/5] Optimize NLI model with eval(), no_grad(), and device-aware inputs

---
 backend/Generator/main.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backend/Generator/main.py b/backend/Generator/main.py
index ed4ebbdb..40646cef 100644
--- a/backend/Generator/main.py
+++ b/backend/Generator/main.py
@@ -256,6 +256,7 @@ def __init__(self):
 
         # Explicitly push the NLI model to the detected hardware (GPU or CPU)
         self.nli_model.to(self.device)
+        self.nli_model.eval()
 
         self.set_seed(42)
 
@@ -292,7 +293,8 @@ def predict_answer(self, payload):
         torch.cuda.empty_cache()
 
         return answers
-
+
+    @torch.no_grad()
     def predict_boolean_answer(self, payload):
         input_text = payload.get("input_text", "")
         input_questions = payload.get("input_question", [])
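Patches 1 and 2 together apply the standard PyTorch inference recipe: keep the model and its input tensors on the same device, call eval() so dropout and other training-only layers are disabled, and run the forward pass under no_grad() so autograd does not build a graph. A minimal, self-contained sketch of the same recipe; the checkpoint name and example sentences here are illustrative, not the ones the project uses:

    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    # Illustrative NLI checkpoint; any sequence-classification model works.
    model_name = "typeform/distilbert-base-uncased-mnli"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.to(device)   # weights live on the GPU when one is available
    model.eval()       # disable dropout for deterministic inference

    premise = "The mitochondria is the powerhouse of the cell."
    hypothesis = "Mitochondria produce energy."

    inputs = tokenizer(premise, hypothesis, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}  # match the model's device

    with torch.no_grad():  # no autograd graph -> lower peak memory per request
        logits = model(**inputs).logits
    probabilities = torch.softmax(logits, dim=1)

Without the .to(device) calls, the forward pass raises a device-mismatch RuntimeError as soon as CUDA is available; without no_grad(), each request needlessly retains activations for a backward pass that never happens.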
From f525d73e6e21161c38ef21cbf3ab33f39a8ed926 Mon Sep 17 00:00:00 2001
From: itz-sidd
Date: Tue, 3 Mar 2026 16:04:41 +0530
Subject: [PATCH 3/5] Refactor: Implement Singleton ModelManager to fix memory leak

---
 backend/Generator/main.py | 66 +++++++++++++++++++++++++++++----------
 backend/test_server.py    |  2 +-
 2 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/backend/Generator/main.py b/backend/Generator/main.py
index 40646cef..1d6833e6 100644
--- a/backend/Generator/main.py
+++ b/backend/Generator/main.py
@@ -26,17 +26,49 @@
 
 
 
-class MCQGenerator:
-
+class ModelManager:
+    """Singleton class to load and share massive ML models across generators."""
+    _instance = None
+    _is_initialized = False
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(ModelManager, cls).__new__(cls)
+        return cls._instance
+
     def __init__(self):
-        self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
-        self.model = T5ForConditionalGeneration.from_pretrained('Roasters/Question-Generator')
+        if self._is_initialized:
+            return
+
+        print("Initializing Shared ModelManager... Loading massive models into memory ONCE.")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model.to(self.device)
+
+        self.qg_tokenizer = T5Tokenizer.from_pretrained('t5-large')
+        self.qg_model = T5ForConditionalGeneration.from_pretrained('Roasters/Question-Generator')
+        self.qg_model.to(self.device)
+        self.qg_model.eval()
+
         self.nlp = spacy.load('en_core_web_sm')
         self.s2v = Sense2Vec().from_disk('s2v_old')
         self.fdist = FreqDist(brown.words())
         self.normalized_levenshtein = NormalizedLevenshtein()
+
+        self._is_initialized = True
+
+
+
+class MCQGenerator:
+
+    def __init__(self):
+        manager = ModelManager()
+        self.tokenizer = manager.qg_tokenizer
+        self.model = manager.qg_model
+        self.device = manager.device
+        self.nlp = manager.nlp
+        self.s2v = manager.s2v
+        self.fdist = manager.fdist
+        self.normalized_levenshtein = manager.normalized_levenshtein
         self.set_seed(42)
 
     def set_seed(self, seed):
@@ -87,14 +119,14 @@ def generate_mcq(self, payload):
 class ShortQGenerator:
 
     def __init__(self):
-        self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
-        self.model = T5ForConditionalGeneration.from_pretrained('Roasters/Question-Generator')
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model.to(self.device)
-        self.nlp = spacy.load('en_core_web_sm')
-        self.s2v = Sense2Vec().from_disk('s2v_old')
-        self.fdist = FreqDist(brown.words())
-        self.normalized_levenshtein = NormalizedLevenshtein()
+        manager = ModelManager()
+        self.tokenizer = manager.qg_tokenizer
+        self.model = manager.qg_model
+        self.device = manager.device
+        self.nlp = manager.nlp
+        self.s2v = manager.s2v
+        self.fdist = manager.fdist
+        self.normalized_levenshtein = manager.normalized_levenshtein
         self.set_seed(42)
 
     def set_seed(self, seed):
@@ -138,10 +170,10 @@ def generate_shortq(self, payload):
 class ParaphraseGenerator:
 
     def __init__(self):
-        self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
-        self.model = T5ForConditionalGeneration.from_pretrained('Roasters/Question-Generator')
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model.to(self.device)
+        manager = ModelManager()
+        self.tokenizer = manager.qg_tokenizer
+        self.model = manager.qg_model
+        self.device = manager.device
         self.set_seed(42)
 
     def set_seed(self, seed):
diff --git a/backend/test_server.py b/backend/test_server.py
index 7a4bd38f..89647945 100644
--- a/backend/test_server.py
+++ b/backend/test_server.py
@@ -74,7 +74,7 @@ def test_root():
     assert response.status_code == 200
 
 def test_get_answer():
-    endpoint = '/get_answer'
+    endpoint = '/get_shortq_answer'
     data = {
         'input_text': input_text,
         'input_question': [

From 97bf58526848416234961e53205967234a0d74d7 Mon Sep 17 00:00:00 2001
From: itz-sidd
Date: Tue, 3 Mar 2026 16:53:07 +0530
Subject: [PATCH 4/5] Fix: Add thread-safe locking to ModelManager and update test name per PR review

---
 backend/Generator/main.py | 41 ++++++++++++++++++++++-----------------
 backend/test_server.py    |  6 +++---
 2 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/backend/Generator/main.py b/backend/Generator/main.py
index 1d6833e6..04a8e4a5 100644
--- a/backend/Generator/main.py
+++ b/backend/Generator/main.py
@@ -22,40 +22,45 @@
 import os
 import fitz
 import mammoth
+import threading
 
 
+import threading
 
 class ModelManager:
     """Singleton class to load and share massive ML models across generators."""
     _instance = None
     _is_initialized = False
+    _lock = threading.Lock()
 
     def __new__(cls):
         if cls._instance is None:
-            cls._instance = super(ModelManager, cls).__new__(cls)
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = super(ModelManager, cls).__new__(cls)
         return cls._instance
 
     def __init__(self):
         if self._is_initialized:
             return
 
-        print("Initializing Shared ModelManager... Loading massive models into memory ONCE.")
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-        self.qg_tokenizer = T5Tokenizer.from_pretrained('t5-large')
-        self.qg_model = T5ForConditionalGeneration.from_pretrained('Roasters/Question-Generator')
-        self.qg_model.to(self.device)
-        self.qg_model.eval()
-
-        self.nlp = spacy.load('en_core_web_sm')
-        self.s2v = Sense2Vec().from_disk('s2v_old')
-        self.fdist = FreqDist(brown.words())
-        self.normalized_levenshtein = NormalizedLevenshtein()
-
-        self._is_initialized = True
-
-
+        with self._lock:
+            if not self._is_initialized:
+                print("Initializing Shared ModelManager... Loading massive models into memory ONCE.")
+                self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+                self.qg_tokenizer = T5Tokenizer.from_pretrained('t5-large')
+                self.qg_model = T5ForConditionalGeneration.from_pretrained('Roasters/Question-Generator')
+                self.qg_model.to(self.device)
+                self.qg_model.eval()
+
+                self.nlp = spacy.load('en_core_web_sm')
+                self.s2v = Sense2Vec().from_disk('s2v_old')
+                self.fdist = FreqDist(brown.words())
+                self.normalized_levenshtein = NormalizedLevenshtein()
+
+                self._is_initialized = True
 
 
 class MCQGenerator:
diff --git a/backend/test_server.py b/backend/test_server.py
index 89647945..d7ed3962 100644
--- a/backend/test_server.py
+++ b/backend/test_server.py
@@ -73,7 +73,7 @@ def test_root():
     print(f'Root Endpoint Response: {response.text}')
     assert response.status_code == 200
 
-def test_get_answer():
+def test_get_shortq_answer():
     endpoint = '/get_shortq_answer'
     data = {
         'input_text': input_text,
@@ -85,7 +85,7 @@ def test_get_answer():
         ]
     }
     response = make_post_request(endpoint, data)
-    print(f'/get_answer Response: {response}')
+    print(f"{endpoint} Response: {response}")
     assert 'output' in response
 
 def test_get_boolean_answer():
@@ -114,5 +114,5 @@ def make_post_request(endpoint, data):
     test_get_shortq()
     test_get_problems()
     test_root()
-    test_get_answer()
+    test_get_shortq_answer()
     test_get_boolean_answer()
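The locking scheme patch 4 introduces is double-checked locking: the unlocked first check keeps the common path cheap once the singleton exists, and the second check inside the lock closes the race window between checking and acquiring. The pattern in isolation, reduced to its essentials; the class name here is illustrative:

    import threading

    class Singleton:
        _instance = None
        _lock = threading.Lock()

        def __new__(cls):
            # Fast path: once the instance exists, no lock is taken.
            if cls._instance is None:
                with cls._lock:
                    # Re-check under the lock: another thread may have
                    # created the instance while we waited to acquire it.
                    if cls._instance is None:
                        cls._instance = super().__new__(cls)
            return cls._instance

Note that Python still calls __init__ on the returned instance for every Singleton() expression, which is why ModelManager additionally guards its one-time initialization with _is_initialized under the same lock.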
From 687653a0ba5c2f2593b6e54f692aeafa393da444 Mon Sep 17 00:00:00 2001
From: itz-sidd
Date: Tue, 3 Mar 2026 17:00:17 +0530
Subject: [PATCH 5/5] Fix: Remove duplicate threading import left over from previous commit

---
 backend/Generator/main.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/backend/Generator/main.py b/backend/Generator/main.py
index 04a8e4a5..1f944d2c 100644
--- a/backend/Generator/main.py
+++ b/backend/Generator/main.py
@@ -26,7 +26,6 @@
 
 
 
-import threading
 
 class ModelManager:
     """Singleton class to load and share massive ML models across generators."""
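With the series applied, every generator is a thin wrapper over the shared manager, so the question-generation weights are loaded once per process rather than once per class. A quick smoke test of that invariant might look like this; a hypothetical script, and the import path depends on how the package is laid out on your machine:

    # Hypothetical check; adjust the import path to match the repo layout.
    from backend.Generator.main import MCQGenerator, ShortQGenerator, ModelManager

    mcq = MCQGenerator()
    shortq = ShortQGenerator()

    # Both generators reference the very same model object, so the
    # T5 weights exist in memory exactly once.
    assert mcq.model is shortq.model
    assert mcq.model is ModelManager().qg_model
    print("Shared QG model id:", id(mcq.model))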