Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 115 additions & 30 deletions backend/Generator/main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import time
import torch
import random
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer,AutoModelForSeq2SeqLM, T5ForConditionalGeneration, T5Tokenizer
from transformers import (
AutoModelForSequenceClassification,
AutoModelForSeq2SeqLM,
AutoTokenizer,
T5ForConditionalGeneration,
T5Tokenizer,
)
import numpy as np
import spacy
from sense2vec import Sense2Vec
Expand All @@ -14,14 +19,16 @@
from Generator.encoding import beam_search_decoding
from google.oauth2 import service_account
from googleapiclient.discovery import build
from werkzeug.utils import secure_filename
import en_core_web_sm
import json
import re
from typing import Any, List, Mapping, Tuple
import re
import os
import fitz
import mammoth
import uuid


class MCQGenerator:

Expand All @@ -31,7 +38,7 @@ def __init__(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)
self.nlp = spacy.load('en_core_web_sm')
self.s2v = Sense2Vec().from_disk('s2v_old')
self.s2v = None
self.fdist = FreqDist(brown.words())
self.normalized_levenshtein = NormalizedLevenshtein()
self.set_seed(42)
Expand All @@ -53,7 +60,15 @@ def generate_mcq(self, payload):
sentences = tokenize_into_sentences(text)
modified_text = " ".join(sentences)

keywords = identify_keywords(self.nlp, modified_text, inp['max_questions'], self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))
keywords = identify_keywords(
self.nlp,
modified_text,
inp['max_questions'],
None, # disable sense2vec
self.fdist,
self.normalized_levenshtein,
len(sentences)
)
keyword_sentence_mapping = find_sentences_with_keywords(keywords, sentences)

for k in keyword_sentence_mapping.keys():
Expand Down Expand Up @@ -89,7 +104,7 @@ def __init__(self):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)
self.nlp = spacy.load('en_core_web_sm')
self.s2v = Sense2Vec().from_disk('s2v_old')
self.s2v = None
self.fdist = FreqDist(brown.words())
self.normalized_levenshtein = NormalizedLevenshtein()
self.set_seed(42)
Expand All @@ -110,7 +125,15 @@ def generate_shortq(self, payload):
sentences = tokenize_into_sentences(text)
modified_text = " ".join(sentences)

keywords = identify_keywords(self.nlp, modified_text, inp['max_questions'], self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))
keywords = identify_keywords(
self.nlp,
modified_text,
inp['max_questions'],
None, # disable sense2vec
self.fdist,
self.normalized_levenshtein,
len(sentences)
)
keyword_sentence_mapping = find_sentences_with_keywords(keywords, sentences)

for k in keyword_sentence_mapping.keys():
Expand Down Expand Up @@ -160,7 +183,12 @@ def generate_paraphrase(self, payload):
sentence = text
text_to_paraphrase = "paraphrase: " + sentence + " </s>"

encoding = self.tokenizer.encode_plus(text_to_paraphrase, pad_to_max_length=True, return_tensors="pt")
encoding = self.tokenizer.encode_plus(
text_to_paraphrase,
padding="max_length",
truncation=True,
return_tensors="pt"
)
input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)

beam_outputs = self.model.generate(
Expand All @@ -171,7 +199,7 @@ def generate_paraphrase(self, payload):
num_return_sequences=num,
no_repeat_ngram_size=2,
early_stopping=True
)
)

final_outputs =[]
for beam_output in beam_outputs:
Expand Down Expand Up @@ -208,7 +236,6 @@ def random_choice(self):
a = random.choice([0,1])
return bool(a)


def generate_boolq(self, payload):
start_time = time.time()
inp = {
Expand All @@ -226,7 +253,7 @@ def generate_boolq(self, payload):
encoding = self.tokenizer.encode_plus(form, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)

output = beam_search_decoding (input_ids, attention_masks, self.model, self.tokenizer,num)
output = beam_search_decoding(input_ids, attention_masks, self.model, self.tokenizer, num)
if self.device.type == 'cuda':
torch.cuda.empty_cache()

Expand All @@ -237,7 +264,6 @@ def generate_boolq(self, payload):

return final


class AnswerPredictor:

def __init__(self):
Expand Down Expand Up @@ -267,11 +293,10 @@ def greedy_decoding(self, inp_ids, attn_mask):
def predict_answer(self, payload):
answers = []
inp = {
"input_text": payload.get("input_text"),
"input_question" : payload.get("input_question")
}
"input_text": payload.get("input_text"),
"input_question": payload.get("input_question")
}
for ques in payload.get("input_question"):

context = inp["input_text"]
question = ques
input_text = "question: %s <s> context: %s </s>" % (question, context)
Expand Down Expand Up @@ -348,7 +373,6 @@ def get_document_content(self, document_url):

return text.strip()


class FileProcessor:
def __init__(self, upload_folder='uploads/'):
self.upload_folder = upload_folder
Expand All @@ -367,21 +391,82 @@ def extract_text_from_docx(self, file_path):
result = mammoth.extract_raw_text(docx_file)
return result.value

    def extract_text_from_image(self, file_path):
        """Run OCR on an image file and return the extracted text.

        The image is read with OpenCV, converted to grayscale, and
        adaptive-thresholded before being passed to Tesseract — the
        thresholding step presumably improves OCR accuracy on uneven
        lighting (NOTE(review): not benchmarked here; confirm it helps
        for the expected document scans).

        Args:
            file_path: Path to an image file on disk.

        Returns:
            The OCR'd text with surrounding whitespace stripped, or an
            empty string when OpenCV cannot decode the file.

        Raises:
            RuntimeError: If opencv-python or pytesseract is not installed
                (imports are deferred so the rest of the class works
                without the OCR extras).
        """
        try:
            # Deferred imports: OCR dependencies are optional extras.
            import cv2
            import pytesseract
            import shutil
        except ImportError as e:
            raise RuntimeError(
                "OCR requires opencv-python and pytesseract installed."
            ) from e

        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread returns None (no exception) for unreadable/corrupt files.
            return ""

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Binarize with a locally adaptive threshold (11x11 neighborhood,
        # constant 2 subtracted) rather than a global one.
        thresh = cv2.adaptiveThreshold(
            gray,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,
            2
        )

        # Cross-platform Tesseract discovery: an explicit TESSERACT_CMD
        # env var wins; otherwise fall back to whatever is on PATH. If
        # neither is set, pytesseract's own default is left untouched.
        tesseract_cmd = os.getenv("TESSERACT_CMD")
        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
        else:
            detected = shutil.which("tesseract")
            if detected:
                pytesseract.pytesseract.tesseract_cmd = detected

        text = pytesseract.image_to_string(thresh)
        return text.strip()

def process_file(self, file):
file_path = os.path.join(self.upload_folder, file.filename)
file.save(file_path)
content = ""
safe_name = secure_filename(file.filename or "")
if not safe_name:
return ""

unique_name = f"{uuid.uuid4().hex}_{safe_name}"
file_path = os.path.join(self.upload_folder, unique_name)
# Extra safety check (prevents ../ traversal)
abs_upload = os.path.abspath(self.upload_folder)
abs_path = os.path.abspath(file_path)

if file.filename.endswith('.txt'):
with open(file_path, 'r') as f:
content = f.read()
elif file.filename.endswith('.pdf'):
content = self.extract_text_from_pdf(file_path)
elif file.filename.endswith('.docx'):
content = self.extract_text_from_docx(file_path)
if not abs_path.startswith(abs_upload):
return ""

os.remove(file_path)
return content
file.save(file_path)
content = ""
filename = safe_name.lower()

try:
if filename.endswith('.txt'):
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
elif filename.endswith('.pdf'):
content = self.extract_text_from_pdf(file_path)
elif filename.endswith('.docx'):
content = self.extract_text_from_docx(file_path)
elif filename.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
content = self.extract_text_from_image(file_path)
return content

except Exception:
return ""


finally:
try:
if os.path.exists(file_path):
os.remove(file_path)
except OSError:
pass

class QuestionGenerator:
"""A transformer-based NLP system for generating reading comprehension-style questions from
Expand Down Expand Up @@ -803,4 +888,4 @@ def print_qa(qa_list: List[Mapping[str, str]], show_answers: bool = True) -> Non
# print full sentence answers
else:
if show_answers:
print(f"{space}A: {answer}\n")
print(f"{space}A: {answer}\n")
39 changes: 30 additions & 9 deletions backend/Generator/mcq.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,25 @@
nltk.download('stopwords')
nltk.download('popular')

def is_word_available(word, s2v_model):
word = word.replace(" ", "_")
sense = s2v_model.get_best_sense(word)
if sense is not None:
def is_word_available(word, s2v_model, fdist, normalized_levenshtein):
    """Check whether a keyword is usable for question generation.

    When a sense2vec model is supplied, the word must resolve to a best
    sense; when ``s2v_model`` is ``None`` (sense2vec disabled), every word
    is accepted.

    Args:
        word: Candidate keyword; may contain spaces for multi-word phrases.
        s2v_model: A loaded ``Sense2Vec`` instance, or ``None`` to skip
            the sense check entirely.
        fdist: Accepted for signature compatibility with callers; unused.
        normalized_levenshtein: Accepted for signature compatibility; unused.

    Returns:
        ``True`` if the word passes (or the check is disabled), else ``False``.
    """
    # If sense2vec is disabled, skip the sense check.
    if s2v_model is None:
        return True

    # sense2vec keys join multi-word phrases with underscores
    # (e.g. "natural_language_processing|NOUN"); without this
    # normalization multi-word keywords would never match.
    key = word.replace(" ", "_")

    try:
        sense = s2v_model.get_best_sense(key)
        if sense is None:
            return False
    except Exception:
        # A malformed key or model lookup error means the word is unusable.
        return False

    return True

def generate_word_variations(word):
letters = 'abcdefghijklmnopqrstuvwxyz ' + string.punctuation
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
Expand Down Expand Up @@ -55,6 +66,8 @@ def find_similar_words(word, s2v_model):
return out

def get_answer_choices(answer, s2v_model):
if s2v_model is None:
return [], "None"
choices = []

try:
Expand Down Expand Up @@ -177,8 +190,12 @@ def generate_multiple_choice_questions(keyword_sent_mapping, device, tokenizer,
text = context + " " + "answer: " + answer + " </s>"
batch_text.append(text)

encoding = tokenizer.batch_encode_plus(batch_text, pad_to_max_length=True, return_tensors="pt")

encoding = tokenizer.batch_encode_plus(
batch_text,
padding="max_length",
truncation=True,
return_tensors="pt"
)
print("Generating questions using the model...")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

Expand Down Expand Up @@ -223,8 +240,12 @@ def generate_normal_questions(keyword_sent_mapping, device, tokenizer, model):
text = context + " " + "answer: " + answer + " </s>"
batch_text.append(text)

encoding = tokenizer.batch_encode_plus(batch_text, pad_to_max_length=True, return_tensors="pt")

encoding = tokenizer.batch_encode_plus(
batch_text,
padding="max_length",
truncation=True,
return_tensors="pt"
)
print("Running model for generation...")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

Expand Down
Loading