From d1cf9c37f9a4a8268554683537396031dd1046f4 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 10:46:31 +0530 Subject: [PATCH 001/114] add requirement.txt file add requirement.txt file --- backend/Transform.py | 2 +- backend/requirements.txt | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/backend/Transform.py b/backend/Transform.py index 14a6976..2fb6988 100644 --- a/backend/Transform.py +++ b/backend/Transform.py @@ -4,8 +4,8 @@ import json import psycopg2 import io +from Utils import get_postgresql_connection -from EntityRecog.Utils import get_postgresql_connection def lambda_handler(event, context): for record in event['Records']: print(f"New record: {record}") diff --git a/backend/requirements.txt b/backend/requirements.txt index e8f021d..315c9c9 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,5 +1,4 @@ -fastapi -pydantic -pydantic-core -typing-extensions -mangum +numpy +pandas +boto3 +psycopg2 \ No newline at end of file From 829cea561627bf639ed8cd97e5431a084c86097f Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 10:59:41 +0530 Subject: [PATCH 002/114] removed unused libraries --- backend/Submit.py | 3 +-- backend/Transform.py | 1 - backend/requirements.txt | 3 ++- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/backend/Submit.py b/backend/Submit.py index 5cdf6db..d31ddc4 100644 --- a/backend/Submit.py +++ b/backend/Submit.py @@ -8,12 +8,11 @@ comprehend = boto3.client('comprehend') -input_s3_uri = 's3://awstraindata/input.csv' role_arn = 'arn:aws:iam::269854564686:role/hackathon-comprehend-role' bucket_name = 'awstraindata' s3 = boto3.client('s3') # Download the file object -input_csv_object = s3.get_object(Bucket=bucket_name, Key='input.csv') +input_csv_object = s3.get_object(Bucket=bucket_name, Key='input_small.csv') # Read CSV into DataFrame conn = get_postgresql_connection() diff --git a/backend/Transform.py b/backend/Transform.py index 2fb6988..07c34eb 100644 --- a/backend/Transform.py +++ b/backend/Transform.py @@ -2,7 +2,6 @@ import boto3 import tarfile import json -import psycopg2 import io from Utils import get_postgresql_connection diff --git a/backend/requirements.txt b/backend/requirements.txt index 315c9c9..28eb3db 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,4 +1,5 @@ numpy pandas boto3 -psycopg2 \ No newline at end of file +psycopg2 +tkinter \ No newline at end of file From d59acb3da855d0b20979747f9598c045388179b5 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 11:10:25 +0530 Subject: [PATCH 003/114] minor changes minor changes --- backend/Submit.py | 26 +- backend/Transform.py | 7 +- backend/Utils.py | 2 - backend/annotations.json | 962 --------------------------------------- backend/input_feed.json | 602 ------------------------ 5 files changed, 18 insertions(+), 1581 deletions(-) delete mode 100644 backend/annotations.json delete mode 100644 backend/input_feed.json diff --git a/backend/Submit.py b/backend/Submit.py index d31ddc4..476ceea 100644 --- a/backend/Submit.py +++ b/backend/Submit.py @@ -29,6 +29,15 @@ relevance_category TEXT, sentiment TEXT )""") +cursor.execute(""" + CREATE TABLE IF NOT EXISTS comprehend_jobs ( + article_id TEXT, + input_s3_uri TEXT, + entities_path TEXT, + sentiment_path TEXT, + key_phrases_path TEXT + ) + """) input_csv = pd.read_csv(io.BytesIO(input_csv_object['Body'].read())) for index, row in input_csv.iterrows(): print(f"Processing row {index}: {row}") @@ -54,7 +63,7 @@ OutputDataConfig={'S3Uri': 's3://awstraindata/output/entities/'}, DataAccessRoleArn=role_arn, LanguageCode='en', - JobName='MyEntityDetectionJob_' + str(int(time.time())), + JobName='MyEntityDetectionJob_'+ articles_id + '_' + str(int(time.time())) ) result = comprehend.describe_entities_detection_job(JobId=entities_job['JobId']) entities_output = result['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri'] @@ -65,7 +74,7 @@ OutputDataConfig={'S3Uri': 's3://awstraindata/output/sentiment/'}, DataAccessRoleArn=role_arn, LanguageCode='en', - JobName='MySentimentDetectionJob_' + str(int(time.time())), + JobName='MySentimentDetectionJob_' + articles_id + '_' + str(int(time.time())) ) res = comprehend.describe_sentiment_detection_job(JobId=sentiment_job['JobId']) sentiment_output = res['SentimentDetectionJobProperties']['OutputDataConfig']['S3Uri'] @@ -76,7 +85,7 @@ OutputDataConfig={'S3Uri': 's3://awstraindata/output/keyphrases/'}, DataAccessRoleArn=role_arn, LanguageCode='en', - JobName='MyKeyPhrasesDetectionJob_' + str(int(time.time())), + JobName='MyKeyPhrasesDetectionJob_' + articles_id + '_' + str(int(time.time())) ) res = comprehend.describe_key_phrases_detection_job(JobId=phrases_job['JobId']) key_phrases_output = res['KeyPhrasesDetectionJobProperties']['OutputDataConfig']['S3Uri'] @@ -86,15 +95,8 @@ cursor.execute(""" drop table if exists comprehend_jobs """) - cursor.execute(""" - CREATE TABLE IF NOT EXISTS comprehend_jobs ( - article_id TEXT, - input_s3_uri TEXT, - entities_path TEXT, - sentiment_path TEXT, - key_phrases_path TEXT - ) - """) + + print("Inserting into comprehend_jobs table") cursor.execute(""" INSERT INTO comprehend_jobs (article_id, input_s3_uri, entities_path, sentiment_path, key_phrases_path) VALUES (%s, %s, %s, %s, %s)""", (articles_id, s3_uri, entities_output.replace('s3://awstraindata/', ''), sentiment_output.replace('s3://awstraindata/', ''), key_phrases_output.replace('s3://awstraindata/', ''))) diff --git a/backend/Transform.py b/backend/Transform.py index 07c34eb..c06b4b4 100644 --- a/backend/Transform.py +++ b/backend/Transform.py @@ -1,4 +1,3 @@ -from turtle import pd import boto3 import tarfile import json @@ -45,12 +44,14 @@ def lambda_handler(event, context): if not relevance_category: cursor.execute("""update articles set relevance_category = %s where articles_id = %s""", (relevance_category, article_id)) elif type == 'sentiment': - sentiment = row.get('Sentiment', 'NEUTRAL') + sentiment = result.get('Sentiment', 'NEUTRAL') if not sentiment: cursor.execute("""update articles set sentiment = %s where articles_id = %s""", (sentiment, article_id)) elif type == 'keyphrases': - key_phrases = ', '.join(row.get('KeyPhrases', [])) + key_phrases = ', '.join(result.get('KeyPhrases', [])) if not key_phrases: cursor.execute("""update articles set key_phrases = %s where articles_id = %s""", (key_phrases, article_id)) cursor.close() + ## delete the s3 object + s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) conn.close() \ No newline at end of file diff --git a/backend/Utils.py b/backend/Utils.py index 371a1dc..41ed2b4 100644 --- a/backend/Utils.py +++ b/backend/Utils.py @@ -1,7 +1,5 @@ import json import psycopg2 -from psycopg2 import sql - def get_postgresql_connection(): '''get the creds from local config''' diff --git a/backend/annotations.json b/backend/annotations.json deleted file mode 100644 index a6625b0..0000000 --- a/backend/annotations.json +++ /dev/null @@ -1,962 +0,0 @@ -[ - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 30, - "label": "PLACE" - }, - { - "beginOffset": 47, - "endOffset": 53, - "label": "TOPIC" - }, - { - "beginOffset": 55, - "endOffset": 63, - "label": "TOPIC" - }, - { - "beginOffset": 69, - "endOffset": 80, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 26, - "label": "PLACE" - }, - { - "beginOffset": 43, - "endOffset": 50, - "label": "TOPIC" - }, - { - "beginOffset": 52, - "endOffset": 62, - "label": "TOPIC" - }, - { - "beginOffset": 68, - "endOffset": 74, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 29, - "label": "PLACE" - }, - { - "beginOffset": 46, - "endOffset": 52, - "label": "TOPIC" - }, - { - "beginOffset": 54, - "endOffset": 60, - "label": "TOPIC" - }, - { - "beginOffset": 66, - "endOffset": 74, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 23, - "label": "PLACE" - }, - { - "beginOffset": 40, - "endOffset": 49, - "label": "TOPIC" - }, - { - "beginOffset": 51, - "endOffset": 57, - "label": "TOPIC" - }, - { - "beginOffset": 63, - "endOffset": 73, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 25, - "label": "PLACE" - }, - { - "beginOffset": 42, - "endOffset": 51, - "label": "TOPIC" - }, - { - "beginOffset": 53, - "endOffset": 62, - "label": "TOPIC" - }, - { - "beginOffset": 68, - "endOffset": 74, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 29, - "label": "PLACE" - }, - { - "beginOffset": 46, - "endOffset": 52, - "label": "TOPIC" - }, - { - "beginOffset": 54, - "endOffset": 60, - "label": "TOPIC" - }, - { - "beginOffset": 66, - "endOffset": 73, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 27, - "label": "PLACE" - }, - { - "beginOffset": 44, - "endOffset": 51, - "label": "TOPIC" - }, - { - "beginOffset": 53, - "endOffset": 59, - "label": "TOPIC" - }, - { - "beginOffset": 65, - "endOffset": 79, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 24, - "label": "PLACE" - }, - { - "beginOffset": 41, - "endOffset": 51, - "label": "TOPIC" - }, - { - "beginOffset": 53, - "endOffset": 61, - "label": "TOPIC" - }, - { - "beginOffset": 67, - "endOffset": 72, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 27, - "label": "PLACE" - }, - { - "beginOffset": 44, - "endOffset": 52, - "label": "TOPIC" - }, - { - "beginOffset": 54, - "endOffset": 65, - "label": "TOPIC" - }, - { - "beginOffset": 71, - "endOffset": 76, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 24, - "label": "PLACE" - }, - { - "beginOffset": 41, - "endOffset": 50, - "label": "TOPIC" - }, - { - "beginOffset": 52, - "endOffset": 57, - "label": "TOPIC" - }, - { - "beginOffset": 63, - "endOffset": 70, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 23, - "label": "PLACE" - }, - { - "beginOffset": 40, - "endOffset": 49, - "label": "TOPIC" - }, - { - "beginOffset": 51, - "endOffset": 57, - "label": "TOPIC" - }, - { - "beginOffset": 63, - "endOffset": 69, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 23, - "label": "PLACE" - }, - { - "beginOffset": 40, - "endOffset": 54, - "label": "TOPIC" - }, - { - "beginOffset": 56, - "endOffset": 67, - "label": "TOPIC" - }, - { - "beginOffset": 73, - "endOffset": 82, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 23, - "label": "PLACE" - }, - { - "beginOffset": 40, - "endOffset": 50, - "label": "TOPIC" - }, - { - "beginOffset": 52, - "endOffset": 58, - "label": "TOPIC" - }, - { - "beginOffset": 64, - "endOffset": 73, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 27, - "label": "PLACE" - }, - { - "beginOffset": 44, - "endOffset": 55, - "label": "TOPIC" - }, - { - "beginOffset": 57, - "endOffset": 66, - "label": "TOPIC" - }, - { - "beginOffset": 72, - "endOffset": 82, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 23, - "label": "PLACE" - }, - { - "beginOffset": 40, - "endOffset": 45, - "label": "TOPIC" - }, - { - "beginOffset": 47, - "endOffset": 57, - "label": "TOPIC" - }, - { - "beginOffset": 63, - "endOffset": 70, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 27, - "label": "PLACE" - }, - { - "beginOffset": 44, - "endOffset": 54, - "label": "TOPIC" - }, - { - "beginOffset": 56, - "endOffset": 65, - "label": "TOPIC" - }, - { - "beginOffset": 71, - "endOffset": 80, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 24, - "label": "PLACE" - }, - { - "beginOffset": 41, - "endOffset": 48, - "label": "TOPIC" - }, - { - "beginOffset": 50, - "endOffset": 60, - "label": "TOPIC" - }, - { - "beginOffset": 66, - "endOffset": 80, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 26, - "label": "PLACE" - }, - { - "beginOffset": 43, - "endOffset": 49, - "label": "TOPIC" - }, - { - "beginOffset": 51, - "endOffset": 57, - "label": "TOPIC" - }, - { - "beginOffset": 63, - "endOffset": 70, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 29, - "label": "PLACE" - }, - { - "beginOffset": 46, - "endOffset": 54, - "label": "TOPIC" - }, - { - "beginOffset": 56, - "endOffset": 65, - "label": "TOPIC" - }, - { - "beginOffset": 71, - "endOffset": 80, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 25, - "label": "PLACE" - }, - { - "beginOffset": 42, - "endOffset": 50, - "label": "TOPIC" - }, - { - "beginOffset": 52, - "endOffset": 59, - "label": "TOPIC" - }, - { - "beginOffset": 65, - "endOffset": 74, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 24, - "label": "PLACE" - }, - { - "beginOffset": 41, - "endOffset": 48, - "label": "TOPIC" - }, - { - "beginOffset": 50, - "endOffset": 56, - "label": "TOPIC" - }, - { - "beginOffset": 62, - "endOffset": 68, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 29, - "label": "PLACE" - }, - { - "beginOffset": 46, - "endOffset": 56, - "label": "TOPIC" - }, - { - "beginOffset": 58, - "endOffset": 64, - "label": "TOPIC" - }, - { - "beginOffset": 70, - "endOffset": 79, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 23, - "label": "PLACE" - }, - { - "beginOffset": 40, - "endOffset": 46, - "label": "TOPIC" - }, - { - "beginOffset": 48, - "endOffset": 55, - "label": "TOPIC" - }, - { - "beginOffset": 61, - "endOffset": 71, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 25, - "label": "PLACE" - }, - { - "beginOffset": 42, - "endOffset": 49, - "label": "TOPIC" - }, - { - "beginOffset": 51, - "endOffset": 58, - "label": "TOPIC" - }, - { - "beginOffset": 64, - "endOffset": 73, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 26, - "label": "PLACE" - }, - { - "beginOffset": 43, - "endOffset": 53, - "label": "TOPIC" - }, - { - "beginOffset": 55, - "endOffset": 62, - "label": "TOPIC" - }, - { - "beginOffset": 68, - "endOffset": 77, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 24, - "label": "PLACE" - }, - { - "beginOffset": 41, - "endOffset": 55, - "label": "TOPIC" - }, - { - "beginOffset": 57, - "endOffset": 67, - "label": "TOPIC" - }, - { - "beginOffset": 73, - "endOffset": 80, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 24, - "label": "PLACE" - }, - { - "beginOffset": 41, - "endOffset": 47, - "label": "TOPIC" - }, - { - "beginOffset": 49, - "endOffset": 56, - "label": "TOPIC" - }, - { - "beginOffset": 62, - "endOffset": 71, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 25, - "label": "PLACE" - }, - { - "beginOffset": 42, - "endOffset": 52, - "label": "TOPIC" - }, - { - "beginOffset": 54, - "endOffset": 64, - "label": "TOPIC" - }, - { - "beginOffset": 70, - "endOffset": 77, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 25, - "label": "PLACE" - }, - { - "beginOffset": 42, - "endOffset": 49, - "label": "TOPIC" - }, - { - "beginOffset": 51, - "endOffset": 65, - "label": "TOPIC" - }, - { - "beginOffset": 71, - "endOffset": 80, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 25, - "label": "PLACE" - }, - { - "beginOffset": 42, - "endOffset": 48, - "label": "TOPIC" - }, - { - "beginOffset": 50, - "endOffset": 59, - "label": "TOPIC" - }, - { - "beginOffset": 65, - "endOffset": 74, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 23, - "label": "PLACE" - }, - { - "beginOffset": 40, - "endOffset": 47, - "label": "TOPIC" - }, - { - "beginOffset": 49, - "endOffset": 55, - "label": "TOPIC" - }, - { - "beginOffset": 61, - "endOffset": 71, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 23, - "label": "PLACE" - }, - { - "beginOffset": 40, - "endOffset": 46, - "label": "TOPIC" - }, - { - "beginOffset": 48, - "endOffset": 55, - "label": "TOPIC" - }, - { - "beginOffset": 61, - "endOffset": 70, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 29, - "label": "PLACE" - }, - { - "beginOffset": 46, - "endOffset": 53, - "label": "TOPIC" - }, - { - "beginOffset": 55, - "endOffset": 60, - "label": "TOPIC" - }, - { - "beginOffset": 66, - "endOffset": 76, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 25, - "label": "PLACE" - }, - { - "beginOffset": 42, - "endOffset": 49, - "label": "TOPIC" - }, - { - "beginOffset": 51, - "endOffset": 61, - "label": "TOPIC" - }, - { - "beginOffset": 67, - "endOffset": 76, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 24, - "label": "PLACE" - }, - { - "beginOffset": 41, - "endOffset": 52, - "label": "TOPIC" - }, - { - "beginOffset": 54, - "endOffset": 63, - "label": "TOPIC" - }, - { - "beginOffset": 69, - "endOffset": 79, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 30, - "label": "PLACE" - }, - { - "beginOffset": 47, - "endOffset": 57, - "label": "TOPIC" - }, - { - "beginOffset": 59, - "endOffset": 64, - "label": "TOPIC" - }, - { - "beginOffset": 70, - "endOffset": 75, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 25, - "label": "PLACE" - }, - { - "beginOffset": 42, - "endOffset": 51, - "label": "TOPIC" - }, - { - "beginOffset": 53, - "endOffset": 60, - "label": "TOPIC" - }, - { - "beginOffset": 66, - "endOffset": 75, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 27, - "label": "PLACE" - }, - { - "beginOffset": 44, - "endOffset": 50, - "label": "TOPIC" - }, - { - "beginOffset": 52, - "endOffset": 58, - "label": "TOPIC" - }, - { - "beginOffset": 64, - "endOffset": 71, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 23, - "label": "PLACE" - }, - { - "beginOffset": 40, - "endOffset": 54, - "label": "TOPIC" - }, - { - "beginOffset": 56, - "endOffset": 67, - "label": "TOPIC" - }, - { - "beginOffset": 73, - "endOffset": 80, - "label": "TOPIC" - } - ] - }, - { - "annotations": [ - { - "beginOffset": 17, - "endOffset": 23, - "label": "PLACE" - }, - { - "beginOffset": 40, - "endOffset": 46, - "label": "TOPIC" - }, - { - "beginOffset": 48, - "endOffset": 58, - "label": "TOPIC" - }, - { - "beginOffset": 64, - "endOffset": 71, - "label": "TOPIC" - } - ] - } -] \ No newline at end of file diff --git a/backend/input_feed.json b/backend/input_feed.json deleted file mode 100644 index ac56db2..0000000 --- a/backend/input_feed.json +++ /dev/null @@ -1,602 +0,0 @@ -[ - { - "title": "Visakhapatnam news update: Policy, heritage, environment", - "body": "Recent events in Visakhapatnam have focused on policy, heritage, and environment. Authorities are responding accordingly to ensure public welfare.", - "source": "New Indian Express", - "publishedDate": "01/06/2025", - "extractedLocations": [ - "Visakhapatnam" - ], - "districtMapping": "Visakhapatnam", - "tags": [ - "policy", - "heritage", - "environment" - ] - }, - { - "title": "Hyderabad news update: Weather, cybercrime, policy", - "body": "Recent events in Hyderabad have focused on weather, cybercrime, and policy. Authorities are responding accordingly to ensure public welfare.", - "source": "Eenadu", - "publishedDate": "29/06/2025", - "extractedLocations": [ - "Hyderabad" - ], - "districtMapping": "Hyderabad", - "tags": [ - "weather", - "cybercrime", - "policy" - ] - }, - { - "title": "Vizianagaram news update: Policy, sports, heritage", - "body": "Recent events in Vizianagaram have focused on policy, sports, and heritage. Authorities are responding accordingly to ensure public welfare.", - "source": "The Hindu", - "publishedDate": "03/06/2025", - "extractedLocations": [ - "Vizianagaram" - ], - "districtMapping": "Vizianagaram", - "tags": [ - "policy", - "sports", - "heritage" - ] - }, - { - "title": "Ongole news update: Education, policy, employment", - "body": "Recent events in Ongole have focused on education, policy, and employment. Authorities are responding accordingly to ensure public welfare.", - "source": "Andhra Jyothy", - "publishedDate": "25/06/2025", - "extractedLocations": [ - "Ongole" - ], - "districtMapping": "Prakasam", - "tags": [ - "education", - "policy", - "employment" - ] - }, - { - "title": "Tirupati news update: Transport, elections, policy", - "body": "Recent events in Tirupati have focused on transport, elections, and policy. Authorities are responding accordingly to ensure public welfare.", - "source": "The Hindu", - "publishedDate": "10/06/2025", - "extractedLocations": [ - "Tirupati" - ], - "districtMapping": "Tirupati", - "tags": [ - "transport", - "elections", - "policy" - ] - }, - { - "title": "Vizianagaram news update: Policy, sports, startup", - "body": "Recent events in Vizianagaram have focused on policy, sports, and startup. Authorities are responding accordingly to ensure public welfare.", - "source": "Deccan Chronicle", - "publishedDate": "15/06/2025", - "extractedLocations": [ - "Vizianagaram" - ], - "districtMapping": "Vizianagaram", - "tags": [ - "policy", - "sports", - "startup" - ] - }, - { - "title": "Vijayawada news update: Startup, sports, infrastructure", - "body": "Recent events in Vijayawada have focused on startup, sports, and infrastructure. Authorities are responding accordingly to ensure public welfare.", - "source": "New Indian Express", - "publishedDate": "25/06/2025", - "extractedLocations": [ - "Vijayawada" - ], - "districtMapping": "Krishna", - "tags": [ - "startup", - "sports", - "infrastructure" - ] - }, - { - "title": "Nellore news update: Cybercrime, heritage, flood", - "body": "Recent events in Nellore have focused on cybercrime, heritage, and flood. Authorities are responding accordingly to ensure public welfare.", - "source": "Deccan Chronicle", - "publishedDate": "16/06/2025", - "extractedLocations": [ - "Nellore" - ], - "districtMapping": "Nellore", - "tags": [ - "cybercrime", - "heritage", - "flood" - ] - }, - { - "title": "Vijayawada news update: Heritage, environment, flood", - "body": "Recent events in Vijayawada have focused on heritage, environment, and flood. Authorities are responding accordingly to ensure public welfare.", - "source": "New Indian Express", - "publishedDate": "07/06/2025", - "extractedLocations": [ - "Vijayawada" - ], - "districtMapping": "Krishna", - "tags": [ - "heritage", - "environment", - "flood" - ] - }, - { - "title": "Kurnool news update: Transport, crime, culture", - "body": "Recent events in Kurnool have focused on transport, crime, and culture. Authorities are responding accordingly to ensure public welfare.", - "source": "The Hindu", - "publishedDate": "29/06/2025", - "extractedLocations": [ - "Kurnool" - ], - "districtMapping": "Kurnool", - "tags": [ - "transport", - "crime", - "culture" - ] - }, - { - "title": "Guntur news update: Elections, policy, health", - "body": "Recent events in Guntur have focused on elections, policy, and health. Authorities are responding accordingly to ensure public welfare.", - "source": "Sakshi", - "publishedDate": "13/06/2025", - "extractedLocations": [ - "Guntur" - ], - "districtMapping": "Guntur", - "tags": [ - "elections", - "policy", - "health" - ] - }, - { - "title": "Guntur news update: Infrastructure, environment, transport", - "body": "Recent events in Guntur have focused on infrastructure, environment, and transport. Authorities are responding accordingly to ensure public welfare.", - "source": "Deccan Chronicle", - "publishedDate": "06/06/2025", - "extractedLocations": [ - "Guntur" - ], - "districtMapping": "Guntur", - "tags": [ - "infrastructure", - "environment", - "transport" - ] - }, - { - "title": "Guntur news update: Cybercrime, energy, transport", - "body": "Recent events in Guntur have focused on cybercrime, energy, and transport. Authorities are responding accordingly to ensure public welfare.", - "source": "Sakshi", - "publishedDate": "14/06/2025", - "extractedLocations": [ - "Guntur" - ], - "districtMapping": "Guntur", - "tags": [ - "cybercrime", - "energy", - "transport" - ] - }, - { - "title": "Srikakulam news update: Environment, elections, government", - "body": "Recent events in Srikakulam have focused on environment, elections, and government. Authorities are responding accordingly to ensure public welfare.", - "source": "Times of India", - "publishedDate": "06/06/2025", - "extractedLocations": [ - "Srikakulam" - ], - "districtMapping": "Srikakulam", - "tags": [ - "environment", - "elections", - "government" - ] - }, - { - "title": "Ongole news update: Flood, government, weather", - "body": "Recent events in Ongole have focused on flood, government, and weather. Authorities are responding accordingly to ensure public welfare.", - "source": "Times of India", - "publishedDate": "06/06/2025", - "extractedLocations": [ - "Ongole" - ], - "districtMapping": "Prakasam", - "tags": [ - "flood", - "government", - "weather" - ] - }, - { - "title": "Srikakulam news update: Government, transport, education", - "body": "Recent events in Srikakulam have focused on government, transport, and education. Authorities are responding accordingly to ensure public welfare.", - "source": "Sakshi", - "publishedDate": "07/06/2025", - "extractedLocations": [ - "Srikakulam" - ], - "districtMapping": "Srikakulam", - "tags": [ - "government", - "transport", - "education" - ] - }, - { - "title": "Kurnool news update: Weather, government, infrastructure", - "body": "Recent events in Kurnool have focused on weather, government, and infrastructure. Authorities are responding accordingly to ensure public welfare.", - "source": "The Hindu", - "publishedDate": "11/06/2025", - "extractedLocations": [ - "Kurnool" - ], - "districtMapping": "Kurnool", - "tags": [ - "weather", - "government", - "infrastructure" - ] - }, - { - "title": "Anantapur news update: Policy, health, weather", - "body": "Recent events in Anantapur have focused on policy, health, and weather. Authorities are responding accordingly to ensure public welfare.", - "source": "New Indian Express", - "publishedDate": "15/06/2025", - "extractedLocations": [ - "Anantapur" - ], - "districtMapping": "Anantapur", - "tags": [ - "policy", - "health", - "weather" - ] - }, - { - "title": "Vizianagaram news update: Heritage, transport, education", - "body": "Recent events in Vizianagaram have focused on heritage, transport, and education. Authorities are responding accordingly to ensure public welfare.", - "source": "Deccan Chronicle", - "publishedDate": "08/06/2025", - "extractedLocations": [ - "Vizianagaram" - ], - "districtMapping": "Vizianagaram", - "tags": [ - "heritage", - "transport", - "education" - ] - }, - { - "title": "Tirupati news update: Heritage, weather, education", - "body": "Recent events in Tirupati have focused on heritage, weather, and education. Authorities are responding accordingly to ensure public welfare.", - "source": "Deccan Chronicle", - "publishedDate": "18/06/2025", - "extractedLocations": [ - "Tirupati" - ], - "districtMapping": "Tirupati", - "tags": [ - "heritage", - "weather", - "education" - ] - }, - { - "title": "Kurnool news update: Startup, sports, policy", - "body": "Recent events in Kurnool have focused on startup, sports, and policy. Authorities are responding accordingly to ensure public welfare.", - "source": "The Hindu", - "publishedDate": "08/06/2025", - "extractedLocations": [ - "Kurnool" - ], - "districtMapping": "Kurnool", - "tags": [ - "startup", - "sports", - "policy" - ] - }, - { - "title": "Vizianagaram news update: Technology, health, education", - "body": "Recent events in Vizianagaram have focused on technology, health, and education. Authorities are responding accordingly to ensure public welfare.", - "source": "The Hindu", - "publishedDate": "03/06/2025", - "extractedLocations": [ - "Vizianagaram" - ], - "districtMapping": "Vizianagaram", - "tags": [ - "technology", - "health", - "education" - ] - }, - { - "title": "Kadapa news update: Health, weather, employment", - "body": "Recent events in Kadapa have focused on health, weather, and employment. Authorities are responding accordingly to ensure public welfare.", - "source": "New Indian Express", - "publishedDate": "19/06/2025", - "extractedLocations": [ - "Kadapa" - ], - "districtMapping": "Kadapa", - "tags": [ - "health", - "weather", - "employment" - ] - }, - { - "title": "Tirupati news update: Culture, startup, transport", - "body": "Recent events in Tirupati have focused on culture, startup, and transport. Authorities are responding accordingly to ensure public welfare.", - "source": "Eenadu", - "publishedDate": "04/06/2025", - "extractedLocations": [ - "Tirupati" - ], - "districtMapping": "Tirupati", - "tags": [ - "culture", - "startup", - "transport" - ] - }, - { - "title": "Anantapur news update: Employment, culture, education", - "body": "Recent events in Anantapur have focused on employment, culture, and education. Authorities are responding accordingly to ensure public welfare.", - "source": "Eenadu", - "publishedDate": "22/06/2025", - "extractedLocations": [ - "Anantapur" - ], - "districtMapping": "Anantapur", - "tags": [ - "employment", - "culture", - "education" - ] - }, - { - "title": "Kurnool news update: Infrastructure, cybercrime, culture", - "body": "Recent events in Kurnool have focused on infrastructure, cybercrime, and culture. Authorities are responding accordingly to ensure public welfare.", - "source": "The Hindu", - "publishedDate": "14/06/2025", - "extractedLocations": [ - "Kurnool" - ], - "districtMapping": "Kurnool", - "tags": [ - "infrastructure", - "cybercrime", - "culture" - ] - }, - { - "title": "Kurnool news update: Policy, culture, education", - "body": "Recent events in Kurnool have focused on policy, culture, and education. Authorities are responding accordingly to ensure public welfare.", - "source": "Eenadu", - "publishedDate": "08/06/2025", - "extractedLocations": [ - "Kurnool" - ], - "districtMapping": "Kurnool", - "tags": [ - "policy", - "culture", - "education" - ] - }, - { - "title": "Tirupati news update: Technology, employment, weather", - "body": "Recent events in Tirupati have focused on technology, employment, and weather. Authorities are responding accordingly to ensure public welfare.", - "source": "Sakshi", - "publishedDate": "17/06/2025", - "extractedLocations": [ - "Tirupati" - ], - "districtMapping": "Tirupati", - "tags": [ - "technology", - "employment", - "weather" - ] - }, - { - "title": "Tirupati news update: Weather, infrastructure, elections", - "body": "Recent events in Tirupati have focused on weather, infrastructure, and elections. Authorities are responding accordingly to ensure public welfare.", - "source": "Times of India", - "publishedDate": "21/06/2025", - "extractedLocations": [ - "Tirupati" - ], - "districtMapping": "Tirupati", - "tags": [ - "weather", - "infrastructure", - "elections" - ] - }, - { - "title": "Tirupati news update: Sports, education, transport", - "body": "Recent events in Tirupati have focused on sports, education, and transport. Authorities are responding accordingly to ensure public welfare.", - "source": "Times of India", - "publishedDate": "02/06/2025", - "extractedLocations": [ - "Tirupati" - ], - "districtMapping": "Tirupati", - "tags": [ - "sports", - "education", - "transport" - ] - }, - { - "title": "Ongole news update: Weather, health, employment", - "body": "Recent events in Ongole have focused on weather, health, and employment. Authorities are responding accordingly to ensure public welfare.", - "source": "Deccan Chronicle", - "publishedDate": "21/06/2025", - "extractedLocations": [ - "Ongole" - ], - "districtMapping": "Prakasam", - "tags": [ - "weather", - "health", - "employment" - ] - }, - { - "title": "Kadapa news update: Policy, culture, elections", - "body": "Recent events in Kadapa have focused on policy, culture, and elections. Authorities are responding accordingly to ensure public welfare.", - "source": "Sakshi", - "publishedDate": "12/06/2025", - "extractedLocations": [ - "Kadapa" - ], - "districtMapping": "Kadapa", - "tags": [ - "policy", - "culture", - "elections" - ] - }, - { - "title": "Vizianagaram news update: Weather, crime, cybercrime", - "body": "Recent events in Vizianagaram have focused on weather, crime, and cybercrime. Authorities are responding accordingly to ensure public welfare.", - "source": "Sakshi", - "publishedDate": "01/06/2025", - "extractedLocations": [ - "Vizianagaram" - ], - "districtMapping": "Vizianagaram", - "tags": [ - "weather", - "crime", - "cybercrime" - ] - }, - { - "title": "Tirupati news update: Startup, employment, transport", - "body": "Recent events in Tirupati have focused on startup, employment, and transport. Authorities are responding accordingly to ensure public welfare.", - "source": "The Hindu", - "publishedDate": "21/06/2025", - "extractedLocations": [ - "Tirupati" - ], - "districtMapping": "Tirupati", - "tags": [ - "startup", - "employment", - "transport" - ] - }, - { - "title": "Nellore news update: Environment, transport, technology", - "body": "Recent events in Nellore have focused on environment, transport, and technology. Authorities are responding accordingly to ensure public welfare.", - "source": "Sakshi", - "publishedDate": "07/06/2025", - "extractedLocations": [ - "Nellore" - ], - "districtMapping": "Nellore", - "tags": [ - "environment", - "transport", - "technology" - ] - }, - { - "title": "Visakhapatnam news update: Technology, flood, crime", - "body": "Recent events in Visakhapatnam have focused on technology, flood, and crime. Authorities are responding accordingly to ensure public welfare.", - "source": "Andhra Jyothy", - "publishedDate": "14/06/2025", - "extractedLocations": [ - "Visakhapatnam" - ], - "districtMapping": "Visakhapatnam", - "tags": [ - "technology", - "flood", - "crime" - ] - }, - { - "title": "Chittoor news update: Transport, weather, education", - "body": "Recent events in Chittoor have focused on transport, weather, and education. Authorities are responding accordingly to ensure public welfare.", - "source": "Eenadu", - "publishedDate": "08/06/2025", - "extractedLocations": [ - "Chittoor" - ], - "districtMapping": "Chittoor", - "tags": [ - "transport", - "weather", - "education" - ] - }, - { - "title": "Vijayawada news update: Health, policy, culture", - "body": "Recent events in Vijayawada have focused on health, policy, and culture. Authorities are responding accordingly to ensure public welfare.", - "source": "Eenadu", - "publishedDate": "12/06/2025", - "extractedLocations": [ - "Vijayawada" - ], - "districtMapping": "Krishna", - "tags": [ - "health", - "policy", - "culture" - ] - }, - { - "title": "Kadapa news update: Infrastructure, environment, weather", - "body": "Recent events in Kadapa have focused on infrastructure, environment, and weather. Authorities are responding accordingly to ensure public welfare.", - "source": "New Indian Express", - "publishedDate": "09/06/2025", - "extractedLocations": [ - "Kadapa" - ], - "districtMapping": "Kadapa", - "tags": [ - "infrastructure", - "environment", - "weather" - ] - }, - { - "title": "Kadapa news update: Health, government, weather", - "body": "Recent events in Kadapa have focused on health, government, and weather. Authorities are responding accordingly to ensure public welfare.", - "source": "The Hindu", - "publishedDate": "02/06/2025", - "extractedLocations": [ - "Kadapa" - ], - "districtMapping": "Kadapa", - "tags": [ - "health", - "government", - "weather" - ] - } -] \ No newline at end of file From 7c6e38bb6719203d6cc099971f9f7583626932a2 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 11:15:54 +0530 Subject: [PATCH 004/114] fix the table issue fix the table issue --- backend/Submit.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/backend/Submit.py b/backend/Submit.py index 476ceea..7d00656 100644 --- a/backend/Submit.py +++ b/backend/Submit.py @@ -29,6 +29,9 @@ relevance_category TEXT, sentiment TEXT )""") +cursor.execute(""" + drop table if exists comprehend_jobs +""") cursor.execute(""" CREATE TABLE IF NOT EXISTS comprehend_jobs ( article_id TEXT, @@ -92,9 +95,7 @@ print("Entities Job Response:", entities_output) print("Sentiment Job Response:", sentiment_output) print("Key Phrases Job Response:", key_phrases_output) - cursor.execute(""" - drop table if exists comprehend_jobs - """) + print("Inserting into comprehend_jobs table") cursor.execute(""" From 87590123f64c997c45252acfe6f028dddecfbc4d Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 11:23:48 +0530 Subject: [PATCH 005/114] adding lambda deploy --- .github/workflows/lambda.yaml | 76 +++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 .github/workflows/lambda.yaml diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml new file mode 100644 index 0000000..7d40e4d --- /dev/null +++ b/.github/workflows/lambda.yaml @@ -0,0 +1,76 @@ +name: Deploy Lambdas + +on: + push: + branches: [main, backend_changes] + workflow_dispatch: + +env: + LAMBDA_DIR: backend + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install AWS CLI + run: pip install awscli + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 # Change this if needed + + - name: Deploy Python Lambda Files with Dependencies + run: | + for file in $LAMBDA_DIR/*.py; do + filename=$(basename -- "$file") + function_name="${filename%.*}" + build_dir="/tmp/${function_name}_build" + zip_file="/tmp/${function_name}.zip" + + echo "Creating build directory at $build_dir" + rm -rf "$build_dir" + mkdir -p "$build_dir" + + echo "Copying $file to $build_dir" + cp "$file" "$build_dir/" + + if [ -f "$LAMBDA_DIR/requirements.txt" ]; then + echo "Installing dependencies from requirements.txt" + pip install -r "$LAMBDA_DIR/requirements.txt" -t "$build_dir" + fi + + echo "Zipping contents into $zip_file" + cd "$build_dir" + zip -r "$zip_file" . > /dev/null + cd - + + echo "Checking if function $function_name exists..." + if aws lambda get-function --function-name "$function_name" > /dev/null 2>&1; then + echo "Function exists. Updating code..." + aws lambda update-function-code \ + --function-name "$function_name" \ + --zip-file "fileb://$zip_file" + else + echo "Function does not exist. Creating..." + aws lambda create-function \ + --function-name "$function_name" \ + --runtime python3.12 \ + --role "arn:aws:iam::269854564686:role/hackathon-lambda-role" \ + --handler "${function_name}.lambda_handler" \ + --zip-file "fileb://$zip_file" \ + --timeout 900 \ + --vpc-config SubnetIds=subnet-02e62e34308bb07d5,subnet-0534b99dd34e646f1,SecurityGroupIds=sg-0b9a6b812b30a1107 + fi + done From 767aed4c96672752f4c66f9b66c785189d38b385 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 11:25:23 +0530 Subject: [PATCH 006/114] adding lambda deploy --- .github/workflows/lambda.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 7d40e4d..e441a0c 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -27,8 +27,8 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v2 with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-access-key-id: AKIAT5VEV4FHFQQJBZVX + aws-secret-access-key: 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 aws-region: us-east-1 # Change this if needed - name: Deploy Python Lambda Files with Dependencies From ec3f4a6bd9ba8fbe722dcba9a5895df3865fe7ce Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 11:27:23 +0530 Subject: [PATCH 007/114] adding lambda deploy --- backend/requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 28eb3db..315c9c9 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,5 +1,4 @@ numpy pandas boto3 -psycopg2 -tkinter \ No newline at end of file +psycopg2 \ No newline at end of file From 0fb86ea1dd260bee4e34e64fae413875f59ab9b3 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 11:37:54 +0530 Subject: [PATCH 008/114] adding lambda deploy --- .github/workflows/lambda.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index e441a0c..b986f58 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -59,18 +59,25 @@ jobs: echo "Checking if function $function_name exists..." if aws lambda get-function --function-name "$function_name" > /dev/null 2>&1; then echo "Function exists. Updating code..." + aws s3 cp "$zip_file" "s3://$DEPLOY_BUCKET/$function_name.zip" + aws lambda update-function-code \ --function-name "$function_name" \ - --zip-file "fileb://$zip_file" + --s3-bucket "hackathon-lambda-ap-ai-cyberark" \ + --s3-key "$function_name.zip" + else echo "Function does not exist. Creating..." + aws s3 cp "$zip_file" "s3://hackathon-lambda-ap-ai-cyberark/$function_name.zip" + aws lambda create-function \ --function-name "$function_name" \ --runtime python3.12 \ --role "arn:aws:iam::269854564686:role/hackathon-lambda-role" \ --handler "${function_name}.lambda_handler" \ - --zip-file "fileb://$zip_file" \ + --code S3Bucket="hackathon-lambda-ap-ai-cyberark",S3Key="$function_name.zip" \ --timeout 900 \ --vpc-config SubnetIds=subnet-02e62e34308bb07d5,subnet-0534b99dd34e646f1,SecurityGroupIds=sg-0b9a6b812b30a1107 + fi done From fddb4f5ddf98676bedc5f7045f721cfd0e2aeb5d Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 11:47:41 +0530 Subject: [PATCH 009/114] structure change structure change --- backend/JsontoCSV.py | 14 -------- backend/Submit.py | 2 -- backend/input.csv | 41 ----------------------- backend/{ => output_handler}/Transform.py | 25 +++++++++++++- backend/requirements.txt | 3 +- 5 files changed, 25 insertions(+), 60 deletions(-) delete mode 100644 backend/JsontoCSV.py delete mode 100644 backend/input.csv rename backend/{ => output_handler}/Transform.py (61%) diff --git a/backend/JsontoCSV.py b/backend/JsontoCSV.py deleted file mode 100644 index b510dec..0000000 --- a/backend/JsontoCSV.py +++ /dev/null @@ -1,14 +0,0 @@ -import json -import pandas as pd - -# Load the JSON file -with open("input_feed.json", "r", encoding="utf-8") as f: - data = json.load(f) - -# Normalize and convert lists to strings -df = pd.json_normalize(data) -df["extractedLocations"] = df["extractedLocations"].apply(lambda x: ", ".join(x)) -df["tags"] = df["tags"].apply(lambda x: ", ".join(x)) - -# Export to CSV -df.to_csv("news_feed_converted.csv", index=False) \ No newline at end of file diff --git a/backend/Submit.py b/backend/Submit.py index 7d00656..ca74254 100644 --- a/backend/Submit.py +++ b/backend/Submit.py @@ -95,8 +95,6 @@ print("Entities Job Response:", entities_output) print("Sentiment Job Response:", sentiment_output) print("Key Phrases Job Response:", key_phrases_output) - - print("Inserting into comprehend_jobs table") cursor.execute(""" INSERT INTO comprehend_jobs (article_id, input_s3_uri, entities_path, sentiment_path, key_phrases_path) diff --git a/backend/input.csv b/backend/input.csv deleted file mode 100644 index ae4e274..0000000 --- a/backend/input.csv +++ /dev/null @@ -1,41 +0,0 @@ -article_id,title,body,source,publishedDate -1,"Visakhapatnam news update: Policy, heritage, environment","Recent events in Visakhapatnam have focused on policy, heritage, and environment. Authorities are responding accordingly to ensure public welfare.",New Indian Express,01/06/2025 -2,"Hyderabad news update: Weather, cybercrime, policy","Recent events in Hyderabad have focused on weather, cybercrime, and policy. Authorities are responding accordingly to ensure public welfare.",Eenadu,29/06/2025 -3,"Vizianagaram news update: Policy, sports, heritage","Recent events in Vizianagaram have focused on policy, sports, and heritage. Authorities are responding accordingly to ensure public welfare.",The Hindu,03/06/2025 -4,"Ongole news update: Education, policy, employment","Recent events in Ongole have focused on education, policy, and employment. Authorities are responding accordingly to ensure public welfare.",Andhra Jyothy,25/06/2025 -5,"Tirupati news update: Transport, elections, policy","Recent events in Tirupati have focused on transport, elections, and policy. Authorities are responding accordingly to ensure public welfare.",The Hindu,10/06/2025 -6,"Vizianagaram news update: Policy, sports, startup","Recent events in Vizianagaram have focused on policy, sports, and startup. Authorities are responding accordingly to ensure public welfare.",Deccan Chronicle,15/06/2025 -7,"Vijayawada news update: Startup, sports, infrastructure","Recent events in Vijayawada have focused on startup, sports, and infrastructure. Authorities are responding accordingly to ensure public welfare.",New Indian Express,25/06/2025 -8,"Nellore news update: Cybercrime, heritage, flood","Recent events in Nellore have focused on cybercrime, heritage, and flood. Authorities are responding accordingly to ensure public welfare.",Deccan Chronicle,16/06/2025 -9,"Vijayawada news update: Heritage, environment, flood","Recent events in Vijayawada have focused on heritage, environment, and flood. Authorities are responding accordingly to ensure public welfare.",New Indian Express,07/06/2025 -10,"Kurnool news update: Transport, crime, culture","Recent events in Kurnool have focused on transport, crime, and culture. Authorities are responding accordingly to ensure public welfare.",The Hindu,29/06/2025 -11,"Guntur news update: Elections, policy, health","Recent events in Guntur have focused on elections, policy, and health. Authorities are responding accordingly to ensure public welfare.",Sakshi,13/06/2025 -12,"Guntur news update: Infrastructure, environment, transport","Recent events in Guntur have focused on infrastructure, environment, and transport. Authorities are responding accordingly to ensure public welfare.",Deccan Chronicle,06/06/2025 -13,"Guntur news update: Cybercrime, energy, transport","Recent events in Guntur have focused on cybercrime, energy, and transport. Authorities are responding accordingly to ensure public welfare.",Sakshi,14/06/2025 -14,"Srikakulam news update: Environment, elections, government","Recent events in Srikakulam have focused on environment, elections, and government. Authorities are responding accordingly to ensure public welfare.",Times of India,06/06/2025 -15,"Ongole news update: Flood, government, weather","Recent events in Ongole have focused on flood, government, and weather. Authorities are responding accordingly to ensure public welfare.",Times of India,06/06/2025 -16,"Srikakulam news update: Government, transport, education","Recent events in Srikakulam have focused on government, transport, and education. Authorities are responding accordingly to ensure public welfare.",Sakshi,07/06/2025 -17,"Kurnool news update: Weather, government, infrastructure","Recent events in Kurnool have focused on weather, government, and infrastructure. Authorities are responding accordingly to ensure public welfare.",The Hindu,11/06/2025 -18,"Anantapur news update: Policy, health, weather","Recent events in Anantapur have focused on policy, health, and weather. Authorities are responding accordingly to ensure public welfare.",New Indian Express,15/06/2025 -19,"Vizianagaram news update: Heritage, transport, education","Recent events in Vizianagaram have focused on heritage, transport, and education. Authorities are responding accordingly to ensure public welfare.",Deccan Chronicle,08/06/2025 -20,"Tirupati news update: Heritage, weather, education","Recent events in Tirupati have focused on heritage, weather, and education. Authorities are responding accordingly to ensure public welfare.",Deccan Chronicle,18/06/2025 -21,"Kurnool news update: Startup, sports, policy","Recent events in Kurnool have focused on startup, sports, and policy. Authorities are responding accordingly to ensure public welfare.",The Hindu,08/06/2025 -22,"Vizianagaram news update: Technology, health, education","Recent events in Vizianagaram have focused on technology, health, and education. Authorities are responding accordingly to ensure public welfare.",The Hindu,03/06/2025 -23,"Kadapa news update: Health, weather, employment","Recent events in Kadapa have focused on health, weather, and employment. Authorities are responding accordingly to ensure public welfare.",New Indian Express,19/06/2025 -24,"Tirupati news update: Culture, startup, transport","Recent events in Tirupati have focused on culture, startup, and transport. Authorities are responding accordingly to ensure public welfare.",Eenadu,04/06/2025 -25,"Anantapur news update: Employment, culture, education","Recent events in Anantapur have focused on employment, culture, and education. Authorities are responding accordingly to ensure public welfare.",Eenadu,22/06/2025 -26,"Kurnool news update: Infrastructure, cybercrime, culture","Recent events in Kurnool have focused on infrastructure, cybercrime, and culture. Authorities are responding accordingly to ensure public welfare.",The Hindu,14/06/2025 -27,"Kurnool news update: Policy, culture, education","Recent events in Kurnool have focused on policy, culture, and education. Authorities are responding accordingly to ensure public welfare.",Eenadu,08/06/2025 -28,"Tirupati news update: Technology, employment, weather","Recent events in Tirupati have focused on technology, employment, and weather. Authorities are responding accordingly to ensure public welfare.",Sakshi,17/06/2025 -29,"Tirupati news update: Weather, infrastructure, elections","Recent events in Tirupati have focused on weather, infrastructure, and elections. Authorities are responding accordingly to ensure public welfare.",Times of India,21/06/2025 -30,"Tirupati news update: Sports, education, transport","Recent events in Tirupati have focused on sports, education, and transport. Authorities are responding accordingly to ensure public welfare.",Times of India,02/06/2025 -31,"Ongole news update: Weather, health, employment","Recent events in Ongole have focused on weather, health, and employment. Authorities are responding accordingly to ensure public welfare.",Deccan Chronicle,21/06/2025 -32,"Kadapa news update: Policy, culture, elections","Recent events in Kadapa have focused on policy, culture, and elections. Authorities are responding accordingly to ensure public welfare.",Sakshi,12/06/2025 -33,"Vizianagaram news update: Weather, crime, cybercrime","Recent events in Vizianagaram have focused on weather, crime, and cybercrime. Authorities are responding accordingly to ensure public welfare.",Sakshi,01/06/2025 -34,"Tirupati news update: Startup, employment, transport","Recent events in Tirupati have focused on startup, employment, and transport. Authorities are responding accordingly to ensure public welfare.",The Hindu,21/06/2025 -35,"Nellore news update: Environment, transport, technology","Recent events in Nellore have focused on environment, transport, and technology. Authorities are responding accordingly to ensure public welfare.",Sakshi,07/06/2025 -36,"Visakhapatnam news update: Technology, flood, crime","Recent events in Visakhapatnam have focused on technology, flood, and crime. Authorities are responding accordingly to ensure public welfare.",Andhra Jyothy,14/06/2025 -37,"Chittoor news update: Transport, weather, education","Recent events in Chittoor have focused on transport, weather, and education. Authorities are responding accordingly to ensure public welfare.",Eenadu,08/06/2025 -38,"Vijayawada news update: Health, policy, culture","Recent events in Vijayawada have focused on health, policy, and culture. Authorities are responding accordingly to ensure public welfare.",Eenadu,12/06/2025 -39,"Kadapa news update: Infrastructure, environment, weather","Recent events in Kadapa have focused on infrastructure, environment, and weather. Authorities are responding accordingly to ensure public welfare.",New Indian Express,09/06/2025 -40,"Kadapa news update: Health, government, weather","Recent events in Kadapa have focused on health, government, and weather. Authorities are responding accordingly to ensure public welfare.",The Hindu,02/06/2025 \ No newline at end of file diff --git a/backend/Transform.py b/backend/output_handler/Transform.py similarity index 61% rename from backend/Transform.py rename to backend/output_handler/Transform.py index c06b4b4..678fd5d 100644 --- a/backend/Transform.py +++ b/backend/output_handler/Transform.py @@ -34,9 +34,31 @@ def lambda_handler(event, context): article_id = row['article_id'] for result in results: if type == 'entities': + entity_array = [entity['Text'] for entity in result['Entities']] + location_array = [entity['Text'] for entity in result['Entities'] if entity['Type'] == 'LOCATION'] + relevance_array = [entity['Text'] for entity in result['Entities'] if entity['Type'] != 'LOCATION' and entity['Type'] != 'PERSON'] + if not entity_array: + ## get the entities from the entities table + cursor.execute("SELECT * FROM entities WHERE entity in (%s)",(tuple(entity_array),)) + entity_db_array = [row[0] for row in cursor.fetchall()] + entity_ids = [] + for entity in entity_array: + entity_in_db = [db_entity for db_entity in entity_db_array if db_entity['entity'] == entity] + if not entity_in_db: + cursor.execute("INSERT INTO entities (entity) VALUES (%s) RETURNING id", (entity,)) + db_entity = cursor.fetchone() + db_entity = {'Id': db_entity[0], 'Text': entity} + else: + cursor.execute("SELECT * FROM entities WHERE entity = %s", (entity,)) + db_entity = cursor.fetchone() + db_entity = {'Id': db_entity[0], 'Text': entity} + for db_entity in entity_db_array: + + entity_array.append(db_entity) + entity_ids_array = [db_entity['Id'] for entity_db in entity_db_array if entity['Text'] in entity_array] location_mentions = ', '.join([entity['Text'] for entity in result['Entities'] if entity['Type'] == 'LOCATION']) officials_involved = ', '.join([entity['Text'] for entity in result['Entities'] if entity['Type'] == 'PERSON']) - relevance_category = ', '.join([entity['Text'] for entity in result['Entities'] if entity['Type'] == 'TITLE']) + relevance_category = ', '.join([entity['Text'] for entity in result['Entities'] if entity['Type'] != 'LOCATION' and entity['Type'] != 'PERSON']) if not location_mentions: cursor.execute("""update articles set location_mentions = %s where articles_id = %s""", (location_mentions, article_id)) if not officials_involved: @@ -54,4 +76,5 @@ def lambda_handler(event, context): cursor.close() ## delete the s3 object s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) + # s3.delete_object(Bucket=bucket, Key=key) conn.close() \ No newline at end of file diff --git a/backend/requirements.txt b/backend/requirements.txt index 28eb3db..315c9c9 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,5 +1,4 @@ numpy pandas boto3 -psycopg2 -tkinter \ No newline at end of file +psycopg2 \ No newline at end of file From 65c1b9b6ed7b008e8698f549f16f38220d9811df Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 11:57:22 +0530 Subject: [PATCH 010/114] adding lambda deploy --- .github/workflows/lambda.yaml | 83 ++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index b986f58..72f761d 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -2,11 +2,12 @@ name: Deploy Lambdas on: push: - branches: [main, backend_changes] + branches: [main] workflow_dispatch: env: LAMBDA_DIR: backend + DEPLOY_BUCKET: hackathon-lambda-ap-ai-cyberark jobs: deploy: @@ -29,55 +30,85 @@ jobs: with: aws-access-key-id: AKIAT5VEV4FHFQQJBZVX aws-secret-access-key: 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 - aws-region: us-east-1 # Change this if needed + aws-region: us-east-1 - - name: Deploy Python Lambda Files with Dependencies + - name: Deploy Lambda Functions run: | - for file in $LAMBDA_DIR/*.py; do - filename=$(basename -- "$file") - function_name="${filename%.*}" + for dir in $LAMBDA_DIR/*/; do + dir=${dir%*/} # remove trailing slash + function_name=$(basename "$dir") + entry_point="$dir/${function_name}.py" + + if [ ! -f "$entry_point" ]; then + echo "Skipping $function_name: $entry_point not found" + continue + fi + + echo "Packaging Lambda function: $function_name" + build_dir="/tmp/${function_name}_build" zip_file="/tmp/${function_name}.zip" - echo "Creating build directory at $build_dir" rm -rf "$build_dir" mkdir -p "$build_dir" - echo "Copying $file to $build_dir" - cp "$file" "$build_dir/" + # Copy main lambda file + cp "$entry_point" "$build_dir/" - if [ -f "$LAMBDA_DIR/requirements.txt" ]; then - echo "Installing dependencies from requirements.txt" - pip install -r "$LAMBDA_DIR/requirements.txt" -t "$build_dir" - fi + # Copy shared utils + for dir in $LAMBDA_DIR/*/; do + dir=${dir%*/} + function_name=$(basename "$dir") + entry_point="$dir/${function_name}.py" + + if [ ! -f "$entry_point" ]; then + echo "Skipping $function_name: $entry_point not found" + continue + fi + + echo "Packaging Lambda function: $function_name" + + build_dir="/tmp/${function_name}_build" + zip_file="/tmp/${function_name}.zip" + + rm -rf "$build_dir" + mkdir -p "$build_dir" - echo "Zipping contents into $zip_file" + # Copy main lambda code + cp "$entry_point" "$build_dir/" + + # āœ… Copy all top-level files in backend/ (excluding folders) + find "$LAMBDA_DIR" -maxdepth 1 -type f -exec cp {} "$build_dir/" \; + + # Install dependencies + if [ -f "$LAMBDA_DIR/requirements.txt" ]; then + pip install -r "$LAMBDA_DIR/requirements.txt" -t "$build_dir" + fi + + # Zip everything cd "$build_dir" zip -r "$zip_file" . > /dev/null cd - - echo "Checking if function $function_name exists..." - if aws lambda get-function --function-name "$function_name" > /dev/null 2>&1; then - echo "Function exists. Updating code..." - aws s3 cp "$zip_file" "s3://$DEPLOY_BUCKET/$function_name.zip" + # Upload to S3 + aws s3 cp "$zip_file" "s3://$DEPLOY_BUCKET/${function_name}.zip" + # Check if function exists + if aws lambda get-function --function-name "$function_name" > /dev/null 2>&1; then + echo "Updating Lambda function: $function_name" aws lambda update-function-code \ --function-name "$function_name" \ - --s3-bucket "hackathon-lambda-ap-ai-cyberark" \ - --s3-key "$function_name.zip" - + --s3-bucket "$DEPLOY_BUCKET" \ + --s3-key "${function_name}.zip" else - echo "Function does not exist. Creating..." - aws s3 cp "$zip_file" "s3://hackathon-lambda-ap-ai-cyberark/$function_name.zip" - + echo "Creating Lambda function: $function_name" aws lambda create-function \ --function-name "$function_name" \ --runtime python3.12 \ --role "arn:aws:iam::269854564686:role/hackathon-lambda-role" \ --handler "${function_name}.lambda_handler" \ - --code S3Bucket="hackathon-lambda-ap-ai-cyberark",S3Key="$function_name.zip" \ + --code S3Bucket="$DEPLOY_BUCKET",S3Key="${function_name}.zip" \ --timeout 900 \ --vpc-config SubnetIds=subnet-02e62e34308bb07d5,subnet-0534b99dd34e646f1,SecurityGroupIds=sg-0b9a6b812b30a1107 - fi done From 01759c0cdea3ba2e89307b9519ba458824ade2f9 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 11:58:15 +0530 Subject: [PATCH 011/114] adding lambda deploy --- .github/workflows/lambda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 72f761d..7e8db76 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -2,7 +2,7 @@ name: Deploy Lambdas on: push: - branches: [main] + branches: [main, backend_changes] workflow_dispatch: env: From 6832bda479f69082e1b4ed1dc6d3dfbc3e7ceb3a Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 11:59:55 +0530 Subject: [PATCH 012/114] adding lambda deploy --- .github/workflows/lambda.yaml | 47 +++++++++++------------------------ 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 7e8db76..2fab899 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -34,13 +34,13 @@ jobs: - name: Deploy Lambda Functions run: | - for dir in $LAMBDA_DIR/*/; do - dir=${dir%*/} # remove trailing slash + for dir in "$LAMBDA_DIR"/*/; do + dir=${dir%/} # remove trailing slash function_name=$(basename "$dir") entry_point="$dir/${function_name}.py" if [ ! -f "$entry_point" ]; then - echo "Skipping $function_name: $entry_point not found" + echo "Skipping $function_name: Entry point $entry_point not found." continue fi @@ -52,46 +52,27 @@ jobs: rm -rf "$build_dir" mkdir -p "$build_dir" - # Copy main lambda file + # Copy the Lambda's main .py file cp "$entry_point" "$build_dir/" - # Copy shared utils - for dir in $LAMBDA_DIR/*/; do - dir=${dir%*/} - function_name=$(basename "$dir") - entry_point="$dir/${function_name}.py" + # āœ… Copy all top-level files in backend (excluding subfolders) + find "$LAMBDA_DIR" -maxdepth 1 -type f -exec cp {} "$build_dir/" \; - if [ ! -f "$entry_point" ]; then - echo "Skipping $function_name: $entry_point not found" - continue - fi + # Remove requirements.txt from build_dir (only needed for pip install) + rm -f "$build_dir/requirements.txt" - echo "Packaging Lambda function: $function_name" - - build_dir="/tmp/${function_name}_build" - zip_file="/tmp/${function_name}.zip" - - rm -rf "$build_dir" - mkdir -p "$build_dir" - - # Copy main lambda code - cp "$entry_point" "$build_dir/" - - # āœ… Copy all top-level files in backend/ (excluding folders) - find "$LAMBDA_DIR" -maxdepth 1 -type f -exec cp {} "$build_dir/" \; + # Install dependencies into build_dir + if [ -f "$LAMBDA_DIR/requirements.txt" ]; then + pip install --no-cache-dir -r "$LAMBDA_DIR/requirements.txt" -t "$build_dir" + fi - # Install dependencies - if [ -f "$LAMBDA_DIR/requirements.txt" ]; then - pip install -r "$LAMBDA_DIR/requirements.txt" -t "$build_dir" - fi - - # Zip everything + # Create ZIP cd "$build_dir" zip -r "$zip_file" . > /dev/null cd - # Upload to S3 - aws s3 cp "$zip_file" "s3://$DEPLOY_BUCKET/${function_name}.zip" + aws s3 cp "$zip_file" "s3://${DEPLOY_BUCKET}/${function_name}.zip" # Check if function exists if aws lambda get-function --function-name "$function_name" > /dev/null 2>&1; then From ac19ff6946538f08d4874e39c41e3f695cb19df0 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 12:01:57 +0530 Subject: [PATCH 013/114] adding lambda deploy --- backend/output_handler/{Transform.py => output_handler.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename backend/output_handler/{Transform.py => output_handler.py} (100%) diff --git a/backend/output_handler/Transform.py b/backend/output_handler/output_handler.py similarity index 100% rename from backend/output_handler/Transform.py rename to backend/output_handler/output_handler.py From 1457c7aa20e0660a444d844046d1eb89d5d67e54 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 12:07:57 +0530 Subject: [PATCH 014/114] add entities code add entities code --- backend/Submit.py | 4 +- backend/output_handler/Transform.py | 60 +++++++++++++---------------- 2 files changed, 27 insertions(+), 37 deletions(-) diff --git a/backend/Submit.py b/backend/Submit.py index ca74254..e6b0e4f 100644 --- a/backend/Submit.py +++ b/backend/Submit.py @@ -24,9 +24,7 @@ body TEXT, source TEXT, published_date TEXT, - location_mentions TEXT, - officials_involved TEXT, - relevance_category TEXT, + entities TEXT, sentiment TEXT )""") cursor.execute(""" diff --git a/backend/output_handler/Transform.py b/backend/output_handler/Transform.py index 678fd5d..9a18aa3 100644 --- a/backend/output_handler/Transform.py +++ b/backend/output_handler/Transform.py @@ -3,6 +3,7 @@ import json import io from Utils import get_postgresql_connection +import datetime def lambda_handler(event, context): for record in event['Records']: @@ -34,47 +35,38 @@ def lambda_handler(event, context): article_id = row['article_id'] for result in results: if type == 'entities': - entity_array = [entity['Text'] for entity in result['Entities']] - location_array = [entity['Text'] for entity in result['Entities'] if entity['Type'] == 'LOCATION'] - relevance_array = [entity['Text'] for entity in result['Entities'] if entity['Type'] != 'LOCATION' and entity['Type'] != 'PERSON'] + entity_array = result['Entities'] if not entity_array: ## get the entities from the entities table - cursor.execute("SELECT * FROM entities WHERE entity in (%s)",(tuple(entity_array),)) - entity_db_array = [row[0] for row in cursor.fetchall()] - entity_ids = [] - for entity in entity_array: - entity_in_db = [db_entity for db_entity in entity_db_array if db_entity['entity'] == entity] - if not entity_in_db: - cursor.execute("INSERT INTO entities (entity) VALUES (%s) RETURNING id", (entity,)) - db_entity = cursor.fetchone() - db_entity = {'Id': db_entity[0], 'Text': entity} - else: - cursor.execute("SELECT * FROM entities WHERE entity = %s", (entity,)) - db_entity = cursor.fetchone() - db_entity = {'Id': db_entity[0], 'Text': entity} - for db_entity in entity_db_array: - - entity_array.append(db_entity) - entity_ids_array = [db_entity['Id'] for entity_db in entity_db_array if entity['Text'] in entity_array] - location_mentions = ', '.join([entity['Text'] for entity in result['Entities'] if entity['Type'] == 'LOCATION']) - officials_involved = ', '.join([entity['Text'] for entity in result['Entities'] if entity['Type'] == 'PERSON']) - relevance_category = ', '.join([entity['Text'] for entity in result['Entities'] if entity['Type'] != 'LOCATION' and entity['Type'] != 'PERSON']) - if not location_mentions: - cursor.execute("""update articles set location_mentions = %s where articles_id = %s""", (location_mentions, article_id)) - if not officials_involved: - cursor.execute("""update articles set officials_involved = %s where articles_id = %s""", (officials_involved, article_id)) - if not relevance_category: - cursor.execute("""update articles set relevance_category = %s where articles_id = %s""", (relevance_category, article_id)) + add_entities_to_article(cursor, article_id, entity_array) + elif type == 'keyphrases': + keyPhrases_array = result['KeyPhrases'] + if not keyPhrases_array: + for keyPhrase in keyPhrases_array: + keyPhrase['Type'] = 'KeyPhrase' + add_entities_to_article(cursor, article_id, keyPhrases_array) elif type == 'sentiment': sentiment = result.get('Sentiment', 'NEUTRAL') if not sentiment: cursor.execute("""update articles set sentiment = %s where articles_id = %s""", (sentiment, article_id)) - elif type == 'keyphrases': - key_phrases = ', '.join(result.get('KeyPhrases', [])) - if not key_phrases: - cursor.execute("""update articles set key_phrases = %s where articles_id = %s""", (key_phrases, article_id)) cursor.close() ## delete the s3 object s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) # s3.delete_object(Bucket=bucket, Key=key) - conn.close() \ No newline at end of file + conn.close() + +def add_entities_to_article(cursor, article_id, entities): + entities_text = [entity['Text'] for entity in entities] + cursor.execute("SELECT * FROM entities WHERE entity in (%s)", (tuple(entities_text),)) + entity_db_array = [row[0] for row in cursor.fetchall()] + entity_ids = cursor.execute("SELECT entities FROM articles WHERE articles_id = %s", (article_id,)).cursor.fetchall() + for entity in entities: + entity_in_db = [db_entity for db_entity in entity_db_array if db_entity['entity'] == entity['Text']] + if not entity_in_db: + current_time = datetime.datetime.utcnow() + cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) + db_entity = cursor.fetchone() + entity_ids.append(db_entity[0]) + else: + entity_ids.append(entity_in_db[0]['Id']) + cursor.execute("""update articles set entities = %s where articles_id = %s""", (entity_ids, article_id)) \ No newline at end of file From 47278aa6c19224373d80433dfec2106aa567c721 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 12:15:34 +0530 Subject: [PATCH 015/114] password update password update --- backend/pg_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/pg_config.json b/backend/pg_config.json index 596e176..9f44167 100644 --- a/backend/pg_config.json +++ b/backend/pg_config.json @@ -2,6 +2,6 @@ "host": "ap-ai-hackathon.cluster-cqt08oi8i1b6.us-east-1.rds.amazonaws.com", "database": "postgres", "user": "postgres", - "password": "AIHackathon", + "password": "!>.VZS)91jj5b0aer", "port": 5432 } \ No newline at end of file From 6d1a97d2d4877befbe380c41ddc57da799c1c35a Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 13:05:00 +0530 Subject: [PATCH 016/114] adding automation for layer --- .github/workflows/lambda.yaml | 3 +- .github/workflows/layer.yaml | 64 +++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/layer.yaml diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 2fab899..5b70bc8 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -90,6 +90,7 @@ jobs: --handler "${function_name}.lambda_handler" \ --code S3Bucket="$DEPLOY_BUCKET",S3Key="${function_name}.zip" \ --timeout 900 \ - --vpc-config SubnetIds=subnet-02e62e34308bb07d5,subnet-0534b99dd34e646f1,SecurityGroupIds=sg-0b9a6b812b30a1107 + --vpc-config SubnetIds=subnet-02e62e34308bb07d5,subnet-0534b99dd34e646f1,SecurityGroupIds=sg-0b9a6b812b30a1107 \ + --layers arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:2 fi done diff --git a/.github/workflows/layer.yaml b/.github/workflows/layer.yaml new file mode 100644 index 0000000..65a84db --- /dev/null +++ b/.github/workflows/layer.yaml @@ -0,0 +1,64 @@ +name: Build Lambda Layer + +on: + push: + paths: + - 'backend/requirements.txt' + workflow_dispatch: + +jobs: + build-and-publish-layer: + runs-on: ubuntu-latest + + env: + LAYER_NAME: my-python-layer + PYTHON_VERSION: python3.9 + S3_KEY: layers/layer.zip + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - name: Install dependencies to python/ + run: | + mkdir -p python + pip install -r backend/requirements.txt --platform manylinux2014_x86_64 --only-binary=:all: -t python/ + + - name: Clean up unnecessary files + run: | + find python/ -type d -name "__pycache__" -exec rm -rf {} + + find python/ -type d -name "tests" -exec rm -rf {} + + find python/ -type f -name "*.pyc" -delete + + - name: Zip the layer + run: zip -r layer.zip python + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: AKIAT5VEV4FHFQQJBZVX + aws-secret-access-key: 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 + aws-region: us-east-1 + + - name: Upload zip to S3 + run: | + aws s3 cp layer.zip s3://hackathon-lambda-ap-ai-cyberark/${{ env.S3_KEY }} + + - name: Publish Lambda Layer from S3 + run: | + aws lambda publish-layer-version \ + --layer-name ${{ env.LAYER_NAME }} \ + --description "Dependencies from backend/requirements.txt" \ + --content S3Bucket=hackathon-lambda-ap-ai-cyberark,S3Key=${{ env.S3_KEY }} \ + --compatible-runtimes ${{ env.PYTHON_VERSION }} + + - name: Upload artifact (optional) + uses: actions/upload-artifact@v4 + with: + name: lambda-layer + path: layer.zip \ No newline at end of file From 0750d3d86430f71b035918bea3927d21df8d9169 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 13:07:42 +0530 Subject: [PATCH 017/114] adding automation for layer --- .github/workflows/lambda.yaml | 3 ++- .github/workflows/layer.yaml | 4 ++-- backend/requirements.txt | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 5b70bc8..c5cd1d4 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -80,7 +80,8 @@ jobs: aws lambda update-function-code \ --function-name "$function_name" \ --s3-bucket "$DEPLOY_BUCKET" \ - --s3-key "${function_name}.zip" + --s3-key "${function_name}.zip" \ + --layers arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:2 else echo "Creating Lambda function: $function_name" aws lambda create-function \ diff --git a/.github/workflows/layer.yaml b/.github/workflows/layer.yaml index 65a84db..b4c49a1 100644 --- a/.github/workflows/layer.yaml +++ b/.github/workflows/layer.yaml @@ -12,7 +12,7 @@ jobs: env: LAYER_NAME: my-python-layer - PYTHON_VERSION: python3.9 + PYTHON_VERSION: python3.12 S3_KEY: layers/layer.zip steps: @@ -22,7 +22,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.12' - name: Install dependencies to python/ run: | diff --git a/backend/requirements.txt b/backend/requirements.txt index 315c9c9..b258e92 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,4 +1,4 @@ numpy pandas boto3 -psycopg2 \ No newline at end of file +psycopg2-binary==2.9.9 \ No newline at end of file From 36bce5319d842600e87d58a2d3553e78216ee23f Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 13:57:10 +0530 Subject: [PATCH 018/114] adding automation for layer --- .github/workflows/lambda.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index c5cd1d4..085ad51 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -81,7 +81,7 @@ jobs: --function-name "$function_name" \ --s3-bucket "$DEPLOY_BUCKET" \ --s3-key "${function_name}.zip" \ - --layers arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:2 + --layers "arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:2" else echo "Creating Lambda function: $function_name" aws lambda create-function \ @@ -92,6 +92,6 @@ jobs: --code S3Bucket="$DEPLOY_BUCKET",S3Key="${function_name}.zip" \ --timeout 900 \ --vpc-config SubnetIds=subnet-02e62e34308bb07d5,subnet-0534b99dd34e646f1,SecurityGroupIds=sg-0b9a6b812b30a1107 \ - --layers arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:2 + --layers "arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:2" fi done From a93ef9ab5ddfec9b16f5121c2c26a0105f272bb3 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 13:59:02 +0530 Subject: [PATCH 019/114] adding automation for lambda --- .github/workflows/lambda.yaml | 39 ++++++++++++++--------------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 085ad51..222c30c 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -20,7 +20,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version: '3.12' - name: Install AWS CLI run: pip install awscli @@ -32,19 +32,19 @@ jobs: aws-secret-access-key: o56LCVEDcTPD8RgU2iWtz8SBklKa5DqQ6+nCYawf aws-region: us-east-1 - - name: Deploy Lambda Functions + - name: Deploy Lambda Functions (No dependencies) run: | for dir in "$LAMBDA_DIR"/*/; do - dir=${dir%/} # remove trailing slash + dir=${dir%/} function_name=$(basename "$dir") entry_point="$dir/${function_name}.py" if [ ! -f "$entry_point" ]; then - echo "Skipping $function_name: Entry point $entry_point not found." + echo "Skipping $function_name: $entry_point not found." continue fi - echo "Packaging Lambda function: $function_name" + echo "Packaging Lambda: $function_name" build_dir="/tmp/${function_name}_build" zip_file="/tmp/${function_name}.zip" @@ -52,38 +52,29 @@ jobs: rm -rf "$build_dir" mkdir -p "$build_dir" - # Copy the Lambda's main .py file + # Copy Lambda source file cp "$entry_point" "$build_dir/" - # āœ… Copy all top-level files in backend (excluding subfolders) - find "$LAMBDA_DIR" -maxdepth 1 -type f -exec cp {} "$build_dir/" \; + # Copy all top-level shared files from backend (excluding directories and requirements.txt) + find "$LAMBDA_DIR" -maxdepth 1 -type f ! -name "requirements.txt" -exec cp {} "$build_dir/" \; - # Remove requirements.txt from build_dir (only needed for pip install) - rm -f "$build_dir/requirements.txt" - - # Install dependencies into build_dir - if [ -f "$LAMBDA_DIR/requirements.txt" ]; then - pip install --no-cache-dir -r "$LAMBDA_DIR/requirements.txt" -t "$build_dir" - fi - - # Create ZIP + # Zip build directory cd "$build_dir" zip -r "$zip_file" . > /dev/null cd - - # Upload to S3 + # Upload zip to S3 aws s3 cp "$zip_file" "s3://${DEPLOY_BUCKET}/${function_name}.zip" - # Check if function exists + # Deploy Lambda if aws lambda get-function --function-name "$function_name" > /dev/null 2>&1; then - echo "Updating Lambda function: $function_name" + echo "Updating Lambda: $function_name" aws lambda update-function-code \ --function-name "$function_name" \ --s3-bucket "$DEPLOY_BUCKET" \ - --s3-key "${function_name}.zip" \ - --layers "arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:2" + --s3-key "${function_name}.zip" else - echo "Creating Lambda function: $function_name" + echo "Creating Lambda: $function_name" aws lambda create-function \ --function-name "$function_name" \ --runtime python3.12 \ @@ -92,6 +83,6 @@ jobs: --code S3Bucket="$DEPLOY_BUCKET",S3Key="${function_name}.zip" \ --timeout 900 \ --vpc-config SubnetIds=subnet-02e62e34308bb07d5,subnet-0534b99dd34e646f1,SecurityGroupIds=sg-0b9a6b812b30a1107 \ - --layers "arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:2" + --layers "arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:3" # šŸ” Replace with your actual layer ARN fi done From 5be69247061e08b3bf81cca53f5ecff76890f375 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 14:03:45 +0530 Subject: [PATCH 020/114] Refactoring Refactoring --- backend/Sample.csv | 19 ---- backend/Submit/Data_parser.py | 42 +++++++++ backend/Submit/Submit.py | 78 ++++++++++++++++ backend/input_handler/input_handler.py | 90 +++++++++++++++++++ .../input_handler_test.py} | 0 backend/requirements.txt | 4 +- 6 files changed, 213 insertions(+), 20 deletions(-) delete mode 100644 backend/Sample.csv create mode 100644 backend/Submit/Data_parser.py create mode 100644 backend/Submit/Submit.py create mode 100644 backend/input_handler/input_handler.py rename backend/{Submit.py => input_handler/input_handler_test.py} (100%) diff --git a/backend/Sample.csv b/backend/Sample.csv deleted file mode 100644 index 7a84ed8..0000000 --- a/backend/Sample.csv +++ /dev/null @@ -1,19 +0,0 @@ -Text,Type -"Accident","MISHAP" -"Argument","MISHAP" -"fight","MISHAP" -"quarrel","MISHAP" -"Burn","MISHAP" -"Snatch","MISHAP" -"Murder","CRIME" -"RoadRage","CRIME" -"Rash driving","CRIME" -"Theft","CRIME" -"Burglary","CRIME" -"Cheat","CRIME" -"Stab","CRIME" -"Kill","CRIME" -"Hate Speech","CRIME" -"Hijack","CRIME" -"Beat","CRIME" -"Threat","CRIME" \ No newline at end of file diff --git a/backend/Submit/Data_parser.py b/backend/Submit/Data_parser.py new file mode 100644 index 0000000..0fa971f --- /dev/null +++ b/backend/Submit/Data_parser.py @@ -0,0 +1,42 @@ +import re +import csv +from docx import Document + +# Step 1: Read DOCX text +def read_docx_text(file_path): + doc = Document(file_path) + return "\n".join(p.text for p in doc.paragraphs) + +# Step 2: Extract articles using regex +def extract_articles(text): + pattern = re.compile( + r'Title:\s*(.*?)\s*Source:\s*(.*?)\s*Date:\s*(.*?)\s*(?=(?:\d{1,2}\)|Title:)|\Z)', + re.DOTALL + ) + matches = pattern.findall(text) + articles = [] + for match in matches: + title = match[0].strip() + source = match[1].strip() + date_parts = match[2].strip().split("\n", 1) + date = date_parts[0].strip() + content = date_parts[1].strip() if len(date_parts) > 1 else "" + articles.append([title, source, date, content]) + return articles + +# Step 3: Save to CSV +def save_to_csv(articles, csv_path): + with open(csv_path, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["Title", "Source", "Date", "Body"]) + writer.writerows(articles) + +# Usage +docx_file = "D:\\Hackathon\\aihack-uc8\\backend\\MixedArticles _SameArticle_DiffSources.docx" # your input DOCX file +csv_file = "articles_parsed.csv" # output CSV file + +text = read_docx_text(docx_file) +articles = extract_articles(text) +save_to_csv(articles, csv_file) + +print(f"āœ… Extracted {len(articles)} articles to {csv_file}") diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py new file mode 100644 index 0000000..15c095d --- /dev/null +++ b/backend/Submit/Submit.py @@ -0,0 +1,78 @@ +import uuid +from Utils import get_postgresql_connection +from fastapi import FastAPI, UploadFile, File +from fastapi.responses import JSONResponse +from docx import Document +import csv +import io +import re +import boto3 + +app = FastAPI() + +s3 = boto3.client('s3') +BUCKET_NAME = 'awstraindata' + +@app.post("/upload/") +async def upload_docx(file: UploadFile = File(...)): + if not file.filename.endswith(".docx"): + return JSONResponse(status_code=400, content={"error": "Only .docx files are supported"}) + + try: + s3_urls = [] + articles = extract_articles(file.file) + conn = get_postgresql_connection() + cursor = conn.cursor() + cursor.execute("drop table if exists articles") + cursor.execute("""CREATE TABLE IF NOT EXISTS articles ( + articles_id TEXT, + title TEXT, + body TEXT, + source TEXT, + published_date TEXT, + entities TEXT, + sentiment TEXT + )""") + for article in articles: + output_csv = io.StringIO() + writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) + writer.writeheader() + writer.writerow(article) + # Generate unique filename + csv_filename = f"/input/articles-{uuid.uuid4()}.csv" + articles_id = str(uuid.uuid4()) # Generate a unique ID for each article + cursor.execute(""" + INSERT INTO articles (articles_id, title, body, source, published_date) + VALUES (%s, %s, %s, %s, %s)""", (articles_id, article[0], article[1], article[2], article[3])) + # Upload to S3 + s3.put_object( + Bucket=BUCKET_NAME, + Key=csv_filename, + Body=output_csv.getvalue(), + ContentType='text/csv' + ) + s3_url = f"https://{BUCKET_NAME}.s3.amazonaws.com/{csv_filename}" + s3_urls.append(s3_url) + return {"status": "success", "count": len(articles), "data": articles, "s3_urls": s3_urls} + + except Exception as e: + return JSONResponse(status_code=500, content={"error": str(e)}) + + +def extract_articles(file_stream): + doc = Document(file_stream) + text = "\n".join(p.text for p in doc.paragraphs) + pattern = re.compile( + r'Title:\s*(.*?)\s*Source:\s*(.*?)\s*Date:\s*(.*?)\s*(?=(?:\d{1,2}\)|Title:)|\Z)', + re.DOTALL + ) + matches = pattern.findall(text) + articles = [] + for match in matches: + title = match[0].strip() + source = match[1].strip() + date_parts = match[2].strip().split("\n", 1) + date = date_parts[0].strip() + content = date_parts[1].strip() if len(date_parts) > 1 else "" + articles.append([title, source, date, content]) + return articles \ No newline at end of file diff --git a/backend/input_handler/input_handler.py b/backend/input_handler/input_handler.py new file mode 100644 index 0000000..d0a270d --- /dev/null +++ b/backend/input_handler/input_handler.py @@ -0,0 +1,90 @@ +import csv +import io +import pandas as pd +import boto3 +import time +import uuid +from Utils import get_postgresql_connection +def lambda_handler(event, context): + comprehend = boto3.client('comprehend') + s3 = boto3.client('s3') + role_arn = 'arn:aws:iam::269854564686:role/hackathon-comprehend-role' + bucket_name = 'awstraindata' + conn = get_postgresql_connection() + cursor = conn.cursor() + for record in event['Records']: + print(f"New record: {record}") + bucket = record['s3']['bucket']['name'] + key = record['s3']['object']['key'] + # Download the file object + input_csv_object = s3.get_object(Bucket=bucket_name, Key=key) + cursor.execute(""" + CREATE TABLE IF NOT EXISTS comprehend_jobs ( + article_id TEXT, + input_s3_uri TEXT, + entities_path TEXT, + sentiment_path TEXT, + key_phrases_path TEXT + ) + """) + input_csv = pd.read_csv(io.BytesIO(input_csv_object['Body'].read())) + for index, row in input_csv.iterrows(): + print(f"Processing row {index}: {row}") + articles_id = str(uuid.uuid4()) # Generate a unique ID for each article + cursor.execute(""" + INSERT INTO articles (articles_id, title, body, source, published_date) + VALUES (%s, %s, %s, %s, %s)""", (articles_id, row[1], row[2], row[3], row[4])) + # Convert to CSV in-memory + csv_buffer = io.StringIO() + writer = csv.writer(csv_buffer) + # writer.writerow(row.headers) # Write header + writer.writerow(row) + s3_path = 'input/' + articles_id + '.csv' + s3_uri = 's3://' + bucket_name + '/' + s3_path + s3.put_object( + Bucket=bucket_name, + Key=s3_path, # adjust as needed + Body=csv_buffer.getvalue(), + ContentType='text/csv' + ) + entities_job = comprehend.start_entities_detection_job( + InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, + OutputDataConfig={'S3Uri': 's3://awstraindata/output/entities/'}, + DataAccessRoleArn=role_arn, + LanguageCode='en', + JobName='MyEntityDetectionJob_'+ articles_id + '_' + str(int(time.time())) + ) + result = comprehend.describe_entities_detection_job(JobId=entities_job['JobId']) + entities_output = result['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri'] + + # SENTIMENT detection job + sentiment_job = comprehend.start_sentiment_detection_job( + InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, + OutputDataConfig={'S3Uri': 's3://awstraindata/output/sentiment/'}, + DataAccessRoleArn=role_arn, + LanguageCode='en', + JobName='MySentimentDetectionJob_' + articles_id + '_' + str(int(time.time())) + ) + res = comprehend.describe_sentiment_detection_job(JobId=sentiment_job['JobId']) + sentiment_output = res['SentimentDetectionJobProperties']['OutputDataConfig']['S3Uri'] + + # KEY PHRASES detection job + phrases_job = comprehend.start_key_phrases_detection_job( + InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, + OutputDataConfig={'S3Uri': 's3://awstraindata/output/keyphrases/'}, + DataAccessRoleArn=role_arn, + LanguageCode='en', + JobName='MyKeyPhrasesDetectionJob_' + articles_id + '_' + str(int(time.time())) + ) + res = comprehend.describe_key_phrases_detection_job(JobId=phrases_job['JobId']) + key_phrases_output = res['KeyPhrasesDetectionJobProperties']['OutputDataConfig']['S3Uri'] + print("Entities Job Response:", entities_output) + print("Sentiment Job Response:", sentiment_output) + print("Key Phrases Job Response:", key_phrases_output) + print("Inserting into comprehend_jobs table") + cursor.execute(""" + INSERT INTO comprehend_jobs (article_id, input_s3_uri, entities_path, sentiment_path, key_phrases_path) + VALUES (%s, %s, %s, %s, %s)""", (articles_id, s3_uri, entities_output.replace('s3://awstraindata/', ''), sentiment_output.replace('s3://awstraindata/', ''), key_phrases_output.replace('s3://awstraindata/', ''))) + conn.commit() + cursor.close() + conn.close() \ No newline at end of file diff --git a/backend/Submit.py b/backend/input_handler/input_handler_test.py similarity index 100% rename from backend/Submit.py rename to backend/input_handler/input_handler_test.py diff --git a/backend/requirements.txt b/backend/requirements.txt index b258e92..4b86188 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,4 +1,6 @@ numpy pandas boto3 -psycopg2-binary==2.9.9 \ No newline at end of file +psycopg2-binary==2.9.9 +python-docx +fastapi[all] \ No newline at end of file From e6d489823087ee390d1004440628f50670a1372c Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 14:17:44 +0530 Subject: [PATCH 021/114] error handling error handling --- backend/input_handler/Utils.py | 25 ++++++ backend/input_handler/input_handler.py | 2 +- backend/output_handler/output_handler.py | 99 +++++++++++++----------- 3 files changed, 79 insertions(+), 47 deletions(-) create mode 100644 backend/input_handler/Utils.py diff --git a/backend/input_handler/Utils.py b/backend/input_handler/Utils.py new file mode 100644 index 0000000..41ed2b4 --- /dev/null +++ b/backend/input_handler/Utils.py @@ -0,0 +1,25 @@ +import json +import psycopg2 +def get_postgresql_connection(): + '''get the creds from local config''' + + """ + Establish a connection to a PostgreSQL database. + + Parameters: + host (str): The hostname of the PostgreSQL server. + database (str): The name of the database to connect to. + user (str): The username to connect with. + password (str): The password for the user. + + Returns: + psycopg2.extensions.connection: A connection object to the PostgreSQL database. + """ + try: + with open("pg_config.json") as f: + config = json.load(f) + conn = psycopg2.connect(**config) + return conn + except psycopg2.Error as e: + print("Error connecting to PostgreSQL database:", e) + return None \ No newline at end of file diff --git a/backend/input_handler/input_handler.py b/backend/input_handler/input_handler.py index d0a270d..50addfe 100644 --- a/backend/input_handler/input_handler.py +++ b/backend/input_handler/input_handler.py @@ -4,7 +4,7 @@ import boto3 import time import uuid -from Utils import get_postgresql_connection +from ..Utils import get_postgresql_connection def lambda_handler(event, context): comprehend = boto3.client('comprehend') s3 = boto3.client('s3') diff --git a/backend/output_handler/output_handler.py b/backend/output_handler/output_handler.py index 9a18aa3..5b8b167 100644 --- a/backend/output_handler/output_handler.py +++ b/backend/output_handler/output_handler.py @@ -6,54 +6,61 @@ import datetime def lambda_handler(event, context): - for record in event['Records']: - print(f"New record: {record}") - bucket = record['s3']['bucket']['name'] - key = record['s3']['object']['key'] - conn = get_postgresql_connection() - s3 = boto3.client('s3') - obj = s3.get_object(Bucket=bucket, Key=key) - tar_bytes = io.BytesIO(obj['Body'].read()) + try: + for record in event['Records']: + print(f"New record: {record}") + bucket = record['s3']['bucket']['name'] + key = record['s3']['object']['key'] + conn = get_postgresql_connection() + s3 = boto3.client('s3') + obj = s3.get_object(Bucket=bucket, Key=key) + tar_bytes = io.BytesIO(obj['Body'].read()) - # Extract .json inside the tar.gz - with tarfile.open(fileobj=tar_bytes, mode='r:gz') as tar: - for member in tar.getmembers(): - if member.name == "output" and member.isfile(): - file = tar.extractfile(member) - results = json.load(file) - print(f"Extracted JSON: {results}") - break + # Extract .json inside the tar.gz + with tarfile.open(fileobj=tar_bytes, mode='r:gz') as tar: + for member in tar.getmembers(): + if member.name == "output" and member.isfile(): + file = tar.extractfile(member) + results = json.load(file) + print(f"Extracted JSON: {results}") + break - if not results: - folderSplit = key.split('/') - type = folderSplit[0] - cursor = conn.cursor() - query = "SELECT * FROM comprehend_jobs WHERE entities_path = %s or sentiment_path = %s or key_phrases_path = %s" - cursor.execute(query, (key, key, key)) - row = cursor.fetchone() - if row: - article_id = row['article_id'] - for result in results: - if type == 'entities': - entity_array = result['Entities'] - if not entity_array: - ## get the entities from the entities table - add_entities_to_article(cursor, article_id, entity_array) - elif type == 'keyphrases': - keyPhrases_array = result['KeyPhrases'] - if not keyPhrases_array: - for keyPhrase in keyPhrases_array: - keyPhrase['Type'] = 'KeyPhrase' - add_entities_to_article(cursor, article_id, keyPhrases_array) - elif type == 'sentiment': - sentiment = result.get('Sentiment', 'NEUTRAL') - if not sentiment: - cursor.execute("""update articles set sentiment = %s where articles_id = %s""", (sentiment, article_id)) - cursor.close() - ## delete the s3 object - s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) - # s3.delete_object(Bucket=bucket, Key=key) - conn.close() + if not results: + folderSplit = key.split('/') + type = folderSplit[0] + cursor = conn.cursor() + query = "SELECT * FROM comprehend_jobs WHERE entities_path = %s or sentiment_path = %s or key_phrases_path = %s" + cursor.execute(query, (key, key, key)) + row = cursor.fetchone() + if row: + article_id = row['article_id'] + for result in results: + if type == 'entities': + entity_array = result['Entities'] + if not entity_array: + ## get the entities from the entities table + add_entities_to_article(cursor, article_id, entity_array) + elif type == 'keyphrases': + keyPhrases_array = result['KeyPhrases'] + if not keyPhrases_array: + for keyPhrase in keyPhrases_array: + keyPhrase['Type'] = 'KeyPhrase' + add_entities_to_article(cursor, article_id, keyPhrases_array) + elif type == 'sentiment': + sentiment = result.get('Sentiment', 'NEUTRAL') + if not sentiment: + cursor.execute("""update articles set sentiment = %s where articles_id = %s""", (sentiment, article_id)) + cursor.close() + ## delete the s3 object + s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) + # s3.delete_object(Bucket=bucket, Key=key) + conn.close() + except Exception as e: + print(f"Error processing record: {e}") + return { + 'statusCode': 500, + 'body': json.dumps({'error': str(e)}) + } def add_entities_to_article(cursor, article_id, entities): entities_text = [entity['Text'] for entity in entities] From b7254c407b3da63e3db3755214a03e777122f705 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 14:31:54 +0530 Subject: [PATCH 022/114] database change database change --- backend/output_handler/output_handler.py | 39 +++++++++++++++++++----- backend/pg_config.json | 4 +-- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/backend/output_handler/output_handler.py b/backend/output_handler/output_handler.py index 5b8b167..aa019fe 100644 --- a/backend/output_handler/output_handler.py +++ b/backend/output_handler/output_handler.py @@ -15,7 +15,7 @@ def lambda_handler(event, context): s3 = boto3.client('s3') obj = s3.get_object(Bucket=bucket, Key=key) tar_bytes = io.BytesIO(obj['Body'].read()) - + print(f"Processing file: {key}") # Extract .json inside the tar.gz with tarfile.open(fileobj=tar_bytes, mode='r:gz') as tar: for member in tar.getmembers(): @@ -24,7 +24,7 @@ def lambda_handler(event, context): results = json.load(file) print(f"Extracted JSON: {results}") break - + print(f"Results: {results}") if not results: folderSplit = key.split('/') type = folderSplit[0] @@ -53,7 +53,6 @@ def lambda_handler(event, context): cursor.close() ## delete the s3 object s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) - # s3.delete_object(Bucket=bucket, Key=key) conn.close() except Exception as e: print(f"Error processing record: {e}") @@ -66,14 +65,40 @@ def add_entities_to_article(cursor, article_id, entities): entities_text = [entity['Text'] for entity in entities] cursor.execute("SELECT * FROM entities WHERE entity in (%s)", (tuple(entities_text),)) entity_db_array = [row[0] for row in cursor.fetchall()] - entity_ids = cursor.execute("SELECT entities FROM articles WHERE articles_id = %s", (article_id,)).cursor.fetchall() + location_mentions = [] + officials_involved = [] + relevance_category = [] for entity in entities: entity_in_db = [db_entity for db_entity in entity_db_array if db_entity['entity'] == entity['Text']] if not entity_in_db: current_time = datetime.datetime.utcnow() cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) db_entity = cursor.fetchone() - entity_ids.append(db_entity[0]) + if entity['Type'] == 'Location': + location_mentions.append(db_entity[0]) + elif entity['Type'] == 'Person': + officials_involved.append(db_entity[0]) + elif entity['Type'] == 'KeyPhrases': + relevance_category.append(db_entity[0]) + else: + print(f"Unknown entity type: {entity['Type']}") else: - entity_ids.append(entity_in_db[0]['Id']) - cursor.execute("""update articles set entities = %s where articles_id = %s""", (entity_ids, article_id)) \ No newline at end of file + if entity['Type'] == 'Location': + location_mentions.append(entity_in_db[0]['Id']) + elif entity['Type'] == 'Person': + officials_involved.append(entity_in_db[0]['Id']) + elif entity['Type'] == 'KeyPhrases': + relevance_category.append(entity_in_db[0]['Id']) + else: + print(f"Unknown entity type: {entity['Type']}") + if location_mentions: + location_mentions = ','.join(map(str, location_mentions)) + cursor.execute("""update articles set location_mentions = %s where articles_id = %s""", (location_mentions, article_id)) + + if officials_involved: + officials_involved = ','.join(map(str, officials_involved)) + cursor.execute("""update articles set officials_involved = %s where articles_id = %s""", (officials_involved, article_id)) + + if relevance_category: + relevance_category = ','.join(map(str, relevance_category)) + cursor.execute("""update articles set relevance_category = %s where articles_id = %s""", (relevance_category, article_id)) \ No newline at end of file diff --git a/backend/pg_config.json b/backend/pg_config.json index 9f44167..a9b4308 100644 --- a/backend/pg_config.json +++ b/backend/pg_config.json @@ -1,6 +1,6 @@ { - "host": "ap-ai-hackathon.cluster-cqt08oi8i1b6.us-east-1.rds.amazonaws.com", - "database": "postgres", + "host": "hackathon-ai-ap.cluster-cqt08oi8i1b6.us-east-1.rds.amazonaws.com", + "database": "bedrock_integration", "user": "postgres", "password": "!>.VZS)91jj5b0aer", "port": 5432 From 2847be3e5ec0d8e3f79e41b9eb0ec69c9809c8ca Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 14:33:28 +0530 Subject: [PATCH 023/114] password update --- backend/pg_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/pg_config.json b/backend/pg_config.json index a9b4308..8fb6344 100644 --- a/backend/pg_config.json +++ b/backend/pg_config.json @@ -2,6 +2,6 @@ "host": "hackathon-ai-ap.cluster-cqt08oi8i1b6.us-east-1.rds.amazonaws.com", "database": "bedrock_integration", "user": "postgres", - "password": "!>.VZS)91jj5b0aer", + "password": "3D6[~771pd5|pkF03dBeL.5#IZ5?", "port": 5432 } \ No newline at end of file From 195f59d085f2b054d491f582afba32b879127cf6 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 14:44:08 +0530 Subject: [PATCH 024/114] debug log --- backend/output_handler/output_handler.py | 1 + backend/pg_config.json | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/output_handler/output_handler.py b/backend/output_handler/output_handler.py index aa019fe..8473ab9 100644 --- a/backend/output_handler/output_handler.py +++ b/backend/output_handler/output_handler.py @@ -11,6 +11,7 @@ def lambda_handler(event, context): print(f"New record: {record}") bucket = record['s3']['bucket']['name'] key = record['s3']['object']['key'] + print(f"Processing file from bucket: {bucket}, key: {key}") conn = get_postgresql_connection() s3 = boto3.client('s3') obj = s3.get_object(Bucket=bucket, Key=key) diff --git a/backend/pg_config.json b/backend/pg_config.json index 8fb6344..64ca955 100644 --- a/backend/pg_config.json +++ b/backend/pg_config.json @@ -1,6 +1,6 @@ { "host": "hackathon-ai-ap.cluster-cqt08oi8i1b6.us-east-1.rds.amazonaws.com", - "database": "bedrock_integration", + "database": "postgres", "user": "postgres", "password": "3D6[~771pd5|pkF03dBeL.5#IZ5?", "port": 5432 From e7e3f4f395304982a3227a93417cf56eab8dba38 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 14:52:20 +0530 Subject: [PATCH 025/114] debugging debugging --- backend/output_handler/output_handler.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/backend/output_handler/output_handler.py b/backend/output_handler/output_handler.py index 8473ab9..0c09c3e 100644 --- a/backend/output_handler/output_handler.py +++ b/backend/output_handler/output_handler.py @@ -19,9 +19,15 @@ def lambda_handler(event, context): print(f"Processing file: {key}") # Extract .json inside the tar.gz with tarfile.open(fileobj=tar_bytes, mode='r:gz') as tar: + print(f"Extracting files from tar: {key}") for member in tar.getmembers(): + print(f"Found member: {member.name}") if member.name == "output" and member.isfile(): + print(f"Extracting JSON file: {member.name}") file = tar.extractfile(member) + if not file: + print(f"File {member.name} not found in tar.") + continue results = json.load(file) print(f"Extracted JSON: {results}") break From 2ff92b3cedefe61ad530c9f2f3f4086860e2d65e Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 15:05:00 +0530 Subject: [PATCH 026/114] Added s3 debug log Added s3 debug log --- backend/output_handler/output_handler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/output_handler/output_handler.py b/backend/output_handler/output_handler.py index 0c09c3e..54e7d08 100644 --- a/backend/output_handler/output_handler.py +++ b/backend/output_handler/output_handler.py @@ -14,7 +14,9 @@ def lambda_handler(event, context): print(f"Processing file from bucket: {bucket}, key: {key}") conn = get_postgresql_connection() s3 = boto3.client('s3') + print(f"Connecting to S3 bucket: {bucket}") obj = s3.get_object(Bucket=bucket, Key=key) + print(f"Downloaded object from S3: {key}") tar_bytes = io.BytesIO(obj['Body'].read()) print(f"Processing file: {key}") # Extract .json inside the tar.gz From 369b6b4d5258a3f0d2a003d913911111c5e9b657 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 15:40:03 +0530 Subject: [PATCH 027/114] adding new lambda --- backend/clustering_service/clustering_service.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 backend/clustering_service/clustering_service.py diff --git a/backend/clustering_service/clustering_service.py b/backend/clustering_service/clustering_service.py new file mode 100644 index 0000000..76eda3b --- /dev/null +++ b/backend/clustering_service/clustering_service.py @@ -0,0 +1,8 @@ +import json + +def lambda_handler(event, context): + # TODO implement + return { + 'statusCode': 200, + 'body': json.dumps('Hello from Lambda!') + } \ No newline at end of file From 2a510bbc45ea5a72fe9587e83b427033f41d60f9 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 15:52:14 +0530 Subject: [PATCH 028/114] adding api code --- .github/workflows/api.yaml | 30 +++++++++ api/deploy-api.py | 133 +++++++++++++++++++++++++++++++++++++ api/get-value.json | 10 +++ api/upload.json | 10 +++ 4 files changed, 183 insertions(+) create mode 100644 .github/workflows/api.yaml create mode 100644 api/deploy-api.py create mode 100644 api/get-value.json create mode 100644 api/upload.json diff --git a/.github/workflows/api.yaml b/.github/workflows/api.yaml new file mode 100644 index 0000000..1c3153b --- /dev/null +++ b/.github/workflows/api.yaml @@ -0,0 +1,30 @@ +name: Deploy API Gateway + +on: + push: + branches: [main, backend_changes] + +jobs: + deploy-api: + runs-on: ubuntu-latest + + steps: + - name: Checkout repo + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install boto3 + - name: Run API deployment script + env: + AWS_ACCESS_KEY_ID: AKIAT5VEV4FHFQQJBZVX + AWS_SECRET_ACCESS_KEY: o56LCVEDcTPD8RgU2iWtz8SBklKa5DqQ6+nCYawf + AWS_REGION: us-east-1 # or your region + run: | + python3 api/deploy-api.py \ No newline at end of file diff --git a/api/deploy-api.py b/api/deploy-api.py new file mode 100644 index 0000000..08298a0 --- /dev/null +++ b/api/deploy-api.py @@ -0,0 +1,133 @@ +import boto3 +import json +import os + +REGION = "us-east-1" +STAGE = "prod" +ACCOUNT_ID = "269854564686" + +apigateway = boto3.client("apigateway", region_name=REGION) +lambda_client = boto3.client("lambda", region_name=REGION) + +def get_or_create_api(api_name): + apis = apigateway.get_rest_apis()["items"] + for api in apis: + if api["name"] == api_name: + print(f"Found API: {api_name}") + return api["id"] + + print(f"Creating API: {api_name}") + response = apigateway.create_rest_api(name=api_name) + return response["id"] + +def get_or_create_resource(api_id, resource_path): + resources = apigateway.get_resources(restApiId=api_id)["items"] + root_id = next(item["id"] for item in resources if item["path"] == "/") + + for res in resources: + if res["path"] == f"/{resource_path}": + print(f"Found resource: /{resource_path}") + return res["id"] + + print(f"Creating resource: /{resource_path}") + response = apigateway.create_resource( + restApiId=api_id, + parentId=root_id, + pathPart=resource_path + ) + return response["id"] + +def method_exists(api_id, resource_id, http_method): + try: + apigateway.get_method( + restApiId=api_id, + resourceId=resource_id, + httpMethod=http_method + ) + return True + except apigateway.exceptions.NotFoundException: + return False + +def add_lambda_permission(lambda_name, api_id, method, path): + statement_id = f"{lambda_name.lower()}-{method.lower()}" + try: + lambda_client.add_permission( + FunctionName=lambda_name, + StatementId=statement_id, + Action="lambda:InvokeFunction", + Principal="apigateway.amazonaws.com", + SourceArn=f"arn:aws:execute-api:{REGION}:{ACCOUNT_ID}:{api_id}/*/{method}/{path}" + ) + print(f"Added permission to Lambda {lambda_name} for method {method} /{path}") + except lambda_client.exceptions.ResourceConflictException: + # Permission already exists + print(f"Permission already exists for Lambda {lambda_name} and method {method} /{path}") + +def setup_method(api_id, resource_id, method_def, path): + method = method_def["httpMethod"].upper() + lambda_name = method_def["lambdaFunctionName"] + auth_type = method_def.get("authorizationType", "NONE") + lambda_arn = f"arn:aws:lambda:{REGION}:{ACCOUNT_ID}:function:{lambda_name}" + + if method_exists(api_id, resource_id, method): + print(f"Method {method} already exists for /{path}, skipping method creation.") + else: + print(f"Creating method {method} for /{path}") + apigateway.put_method( + restApiId=api_id, + resourceId=resource_id, + httpMethod=method, + authorizationType=auth_type + ) + + print(f"Setting integration for {method} /{path}") + apigateway.put_integration( + restApiId=api_id, + resourceId=resource_id, + httpMethod=method, + type="AWS_PROXY", + integrationHttpMethod="POST", + uri=f"arn:aws:apigateway:{REGION}:lambda:path/2015-03-31/functions/{lambda_arn}/invocations" + ) + + add_lambda_permission(lambda_name, api_id, method, path) + +def deploy_api(api_id): + print(f"Deploying API {api_id} to stage: {STAGE}") + apigateway.create_deployment( + restApiId=api_id, + stageName=STAGE + ) + +def main(): + # Use script folder as working directory to find JSON files + script_dir = os.path.dirname(os.path.abspath(__file__)) + + deploy_apis = set() + + # Loop all JSON files in current folder + for file in os.listdir(script_dir): + if not file.endswith(".json"): + continue + + json_path = os.path.join(script_dir, file) + with open(json_path) as f: + config = json.load(f) + + api_name = config["apiName"] + resource_path = config["resourcePath"] + method_def = config["method"] + should_deploy = config.get("deploy", False) + + api_id = get_or_create_api(api_name) + resource_id = get_or_create_resource(api_id, resource_path) + setup_method(api_id, resource_id, method_def, resource_path) + + if should_deploy: + deploy_apis.add(api_id) + + for api_id in deploy_apis: + deploy_api(api_id) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/api/get-value.json b/api/get-value.json new file mode 100644 index 0000000..04ffc52 --- /dev/null +++ b/api/get-value.json @@ -0,0 +1,10 @@ +{ + "apiName": "hackathon", + "resourcePath": "get-value", + "method": { + "httpMethod": "GET", + "authorizationType": "NONE", + "lambdaFunctionName": "Submit" + }, + "deploy": true +} \ No newline at end of file diff --git a/api/upload.json b/api/upload.json new file mode 100644 index 0000000..acce321 --- /dev/null +++ b/api/upload.json @@ -0,0 +1,10 @@ +{ + "apiName": "hackathon", + "resourcePath": "upload", + "method": { + "httpMethod": "POST", + "authorizationType": "NONE", + "lambdaFunctionName": "ANother-test" + }, + "deploy": true +} \ No newline at end of file From 6fb438cdda6bf1797a574c68ad781c8b64201639 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 16:05:11 +0530 Subject: [PATCH 029/114] updated code updated code --- backend/output_handler/output_handler.py | 90 +++++++++++++++--------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/backend/output_handler/output_handler.py b/backend/output_handler/output_handler.py index 54e7d08..f759b07 100644 --- a/backend/output_handler/output_handler.py +++ b/backend/output_handler/output_handler.py @@ -5,6 +5,8 @@ from Utils import get_postgresql_connection import datetime + + def lambda_handler(event, context): try: for record in event['Records']: @@ -30,35 +32,39 @@ def lambda_handler(event, context): if not file: print(f"File {member.name} not found in tar.") continue - results = json.load(file) - print(f"Extracted JSON: {results}") + result = json.load(file) + print(f"Extracted JSON: {result}") break - print(f"Results: {results}") - if not results: + print(f"Results: {result}") + if result: + print(f"Results found in the file: {key}") folderSplit = key.split('/') - type = folderSplit[0] + type = folderSplit[1] cursor = conn.cursor() query = "SELECT * FROM comprehend_jobs WHERE entities_path = %s or sentiment_path = %s or key_phrases_path = %s" cursor.execute(query, (key, key, key)) row = cursor.fetchone() + print(f"Row found: {row}") + print(f"Type of analysis: {type}") if row: - article_id = row['article_id'] - for result in results: - if type == 'entities': - entity_array = result['Entities'] - if not entity_array: - ## get the entities from the entities table - add_entities_to_article(cursor, article_id, entity_array) - elif type == 'keyphrases': - keyPhrases_array = result['KeyPhrases'] - if not keyPhrases_array: - for keyPhrase in keyPhrases_array: - keyPhrase['Type'] = 'KeyPhrase' - add_entities_to_article(cursor, article_id, keyPhrases_array) - elif type == 'sentiment': - sentiment = result.get('Sentiment', 'NEUTRAL') - if not sentiment: - cursor.execute("""update articles set sentiment = %s where articles_id = %s""", (sentiment, article_id)) + article_id = row[0] + print(f"Article ID: {article_id}") + if type == 'entities': + entity_array = result['Entities'] + if entity_array: + ## get the entities from the entities table + add_entities_to_article(cursor, article_id, entity_array) + elif type == 'keyphrases': + keyPhrases_array = result['KeyPhrases'] + if keyPhrases_array: + for keyPhrase in keyPhrases_array: + keyPhrase['Type'] = 'KeyPhrase' + add_entities_to_article(cursor, article_id, keyPhrases_array) + elif type == 'sentiment': + sentiment = result.get('Sentiment', 'NEUTRAL') + if sentiment: + cursor.execute("""update articles set sentiment = %s where articles_id = %s""", (sentiment, article_id)) + conn.commit() cursor.close() ## delete the s3 object s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) @@ -72,34 +78,30 @@ def lambda_handler(event, context): def add_entities_to_article(cursor, article_id, entities): entities_text = [entity['Text'] for entity in entities] - cursor.execute("SELECT * FROM entities WHERE entity in (%s)", (tuple(entities_text),)) + cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) entity_db_array = [row[0] for row in cursor.fetchall()] location_mentions = [] officials_involved = [] - relevance_category = [] + relevance_category = cursor.execute("SELECT * FROM articles WHERE article_id in %s", (article_id,)).fetchall() for entity in entities: entity_in_db = [db_entity for db_entity in entity_db_array if db_entity['entity'] == entity['Text']] if not entity_in_db: current_time = datetime.datetime.utcnow() cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) db_entity = cursor.fetchone() - if entity['Type'] == 'Location': + if entity['Type'] == 'LOCATION': location_mentions.append(db_entity[0]) - elif entity['Type'] == 'Person': + elif entity['Type'] == 'PERSON': officials_involved.append(db_entity[0]) - elif entity['Type'] == 'KeyPhrases': - relevance_category.append(db_entity[0]) else: - print(f"Unknown entity type: {entity['Type']}") + relevance_category.append(db_entity[0]) else: - if entity['Type'] == 'Location': + if entity['Type'] == 'LOCATION': location_mentions.append(entity_in_db[0]['Id']) - elif entity['Type'] == 'Person': + elif entity['Type'] == 'PERSON': officials_involved.append(entity_in_db[0]['Id']) - elif entity['Type'] == 'KeyPhrases': - relevance_category.append(entity_in_db[0]['Id']) else: - print(f"Unknown entity type: {entity['Type']}") + relevance_category.append(entity_in_db[0]['Id']) if location_mentions: location_mentions = ','.join(map(str, location_mentions)) cursor.execute("""update articles set location_mentions = %s where articles_id = %s""", (location_mentions, article_id)) @@ -110,4 +112,22 @@ def add_entities_to_article(cursor, article_id, entities): if relevance_category: relevance_category = ','.join(map(str, relevance_category)) - cursor.execute("""update articles set relevance_category = %s where articles_id = %s""", (relevance_category, article_id)) \ No newline at end of file + cursor.execute("""update articles set relevance_category = %s where articles_id = %s""", (relevance_category, article_id)) + + +events = [ + { + "s3": { + "bucket": { + "name": "awstraindata" + }, + "object": { + "key": "output/entities/269854564686-NER-ec0e8172443cfe1bc633b674b3fd4c44/output/output.tar.gz" + } + } + } +] +obj= { + "Records": events +} +lambda_handler(obj, None) \ No newline at end of file From 07dc606918bb998e0c90eebc4f4df5308bf8e463 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 16:09:45 +0530 Subject: [PATCH 030/114] Adding amplify code --- .github/workflows/amplify.yaml | 37 ++++++++++++++++++ amplify.py | 68 ++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 .github/workflows/amplify.yaml create mode 100644 amplify.py diff --git a/.github/workflows/amplify.yaml b/.github/workflows/amplify.yaml new file mode 100644 index 0000000..9b33a58 --- /dev/null +++ b/.github/workflows/amplify.yaml @@ -0,0 +1,37 @@ +name: Deploy to AWS Amplify (Manual) + +on: + workflow_dispatch: + inputs: + repo: + description: 'GitHub repository (e.g. user/repo-name)' + required: true + branch: + description: 'Branch to deploy (e.g. main)' + required: true + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - name: Checkout this repo (if needed) + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip boto3 + + - name: Run Amplify Deployment Script + env: + AWS_ACCESS_KEY_ID: AKIAT5VEV4FHFQQJBZVX + AWS_SECRET_ACCESS_KEY: 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 + AWS_DEFAULT_REGION: 'us-east-1' # change if needed + GITHUB_PAT: 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 + run: | + python amplify.py "${{ github.event.inputs.repo }}" "${{ github.event.inputs.branch }}" diff --git a/amplify.py b/amplify.py new file mode 100644 index 0000000..06a9f6c --- /dev/null +++ b/amplify.py @@ -0,0 +1,68 @@ +import boto3 +import sys +import time + +GITHUB_PAT = os.getenv("GITHUB_PAT") +if not GITHUB_PAT: + raise Exception("Missing GitHub token in GITHUB_PAT environment variable.") + +repo_full = sys.argv[1] # Format: user/repo +branch = sys.argv[2] +repo_owner, repo_name = repo_full.split('/') +app_name = f"amplify-{repo_name}" + +client = boto3.client('amplify') + +# Create Amplify App +app_response = client.create_app( + name=app_name, + repository=f"https://github.com/{repo_owner}/{repo_name}", + oauthToken=GITHUB_PAT, + platform='WEB', + enableBranchAutoBuild=True +) +app_id = app_response['app']['appId'] +print(f"[āœ“] Created Amplify app with ID: {app_id}") + +# Create Branch +branch_response = client.create_branch( + appId=app_id, + branchName=branch, + framework='React', + enableAutoBuild=True +) +print(f"[āœ“] Created branch '{branch}'") + +# Start Deployment +deploy_response = client.start_job( + appId=app_id, + branchName=branch, + jobType='RELEASE', + jobReason='Manual deployment from GitHub Actions' +) +job_id = deploy_response['jobSummary']['jobId'] +print(f"[šŸš€] Deployment started. Job ID: {job_id}") + +# Monitor +while True: + status = client.get_job(appId=app_id, branchName=branch, jobId=job_id)['job']['summary']['status'] + print(f"→ Status: {status}") + if status in ['SUCCEED', 'FAILED', 'CANCELLED']: + print(f"[āœ”] Final Status: {status}") + break + time.sleep(10) + +domain_name = 'www.apainewsbrief.in' + +response = client.create_domain_association( + appId=app_id, + domainName=domain_name, + subDomainSettings=[ + { + 'prefix': '', # '' for root domain, or 'www', 'app', etc. + 'branchName': branch + }, + ] +) + +print("Custom domain association started:", response['domainAssociation']['domainName']) From 2a270264c67b4d46bfc7e85f4c677076639b6a5e Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 16:14:17 +0530 Subject: [PATCH 031/114] database schema changes database schema changes --- backend/input_handler/input_handler_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/input_handler/input_handler_test.py b/backend/input_handler/input_handler_test.py index e6b0e4f..ca74254 100644 --- a/backend/input_handler/input_handler_test.py +++ b/backend/input_handler/input_handler_test.py @@ -24,7 +24,9 @@ body TEXT, source TEXT, published_date TEXT, - entities TEXT, + location_mentions TEXT, + officials_involved TEXT, + relevance_category TEXT, sentiment TEXT )""") cursor.execute(""" From 7bb7bf68df5cf6c49d91808b82c7936104ea2583 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 16:16:37 +0530 Subject: [PATCH 032/114] testing the submit api testing the submit api --- backend/Submit/Submit.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 15c095d..ba7cb52 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -30,9 +30,23 @@ async def upload_docx(file: UploadFile = File(...)): body TEXT, source TEXT, published_date TEXT, - entities TEXT, + location_mentions TEXT, + officials_involved TEXT, + relevance_category TEXT, sentiment TEXT )""") + cursor.execute(""" + drop table if exists comprehend_jobs + """) + cursor.execute(""" + CREATE TABLE IF NOT EXISTS comprehend_jobs ( + article_id TEXT, + input_s3_uri TEXT, + entities_path TEXT, + sentiment_path TEXT, + key_phrases_path TEXT + ) + """) for article in articles: output_csv = io.StringIO() writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) From 385957abf22af734f8a78f82d15820cd7c7db08e Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 16:20:01 +0530 Subject: [PATCH 033/114] new submit api new submit api --- api/upload.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/upload.json b/api/upload.json index acce321..c67b2cb 100644 --- a/api/upload.json +++ b/api/upload.json @@ -4,7 +4,7 @@ "method": { "httpMethod": "POST", "authorizationType": "NONE", - "lambdaFunctionName": "ANother-test" + "lambdaFunctionName": "Submit" }, "deploy": true } \ No newline at end of file From e3ec3798e0d2f8a125ec4337a97ede7e7b09ca45 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 16:23:13 +0530 Subject: [PATCH 034/114] signature fix signature fix --- .github/workflows/lambda.yaml | 2 +- backend/Submit/Submit.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 222c30c..11d15c8 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -83,6 +83,6 @@ jobs: --code S3Bucket="$DEPLOY_BUCKET",S3Key="${function_name}.zip" \ --timeout 900 \ --vpc-config SubnetIds=subnet-02e62e34308bb07d5,subnet-0534b99dd34e646f1,SecurityGroupIds=sg-0b9a6b812b30a1107 \ - --layers "arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:3" # šŸ” Replace with your actual layer ARN + --layers "arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:4" # šŸ” Replace with your actual layer ARN fi done diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index ba7cb52..eccec4d 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -14,7 +14,7 @@ BUCKET_NAME = 'awstraindata' @app.post("/upload/") -async def upload_docx(file: UploadFile = File(...)): +async def lambda_handler(file: UploadFile = File(...)): if not file.filename.endswith(".docx"): return JSONResponse(status_code=400, content={"error": "Only .docx files are supported"}) From f738c0f466d22d4048f24fb0d04d53973a86011f Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 16:58:39 +0530 Subject: [PATCH 035/114] corner case fix corner case fix --- backend/input_handler/input_handler_test.py | 2 +- backend/output_handler/Utils.py | 25 ++++++++ backend/output_handler/output_handler.py | 68 +++++++++++++-------- 3 files changed, 67 insertions(+), 28 deletions(-) create mode 100644 backend/output_handler/Utils.py diff --git a/backend/input_handler/input_handler_test.py b/backend/input_handler/input_handler_test.py index ca74254..053b51d 100644 --- a/backend/input_handler/input_handler_test.py +++ b/backend/input_handler/input_handler_test.py @@ -19,7 +19,7 @@ cursor = conn.cursor() cursor.execute("drop table if exists articles") cursor.execute("""CREATE TABLE IF NOT EXISTS articles ( - articles_id TEXT, + article_id TEXT, title TEXT, body TEXT, source TEXT, diff --git a/backend/output_handler/Utils.py b/backend/output_handler/Utils.py new file mode 100644 index 0000000..41ed2b4 --- /dev/null +++ b/backend/output_handler/Utils.py @@ -0,0 +1,25 @@ +import json +import psycopg2 +def get_postgresql_connection(): + '''get the creds from local config''' + + """ + Establish a connection to a PostgreSQL database. + + Parameters: + host (str): The hostname of the PostgreSQL server. + database (str): The name of the database to connect to. + user (str): The username to connect with. + password (str): The password for the user. + + Returns: + psycopg2.extensions.connection: A connection object to the PostgreSQL database. + """ + try: + with open("pg_config.json") as f: + config = json.load(f) + conn = psycopg2.connect(**config) + return conn + except psycopg2.Error as e: + print("Error connecting to PostgreSQL database:", e) + return None \ No newline at end of file diff --git a/backend/output_handler/output_handler.py b/backend/output_handler/output_handler.py index f759b07..981e973 100644 --- a/backend/output_handler/output_handler.py +++ b/backend/output_handler/output_handler.py @@ -63,11 +63,11 @@ def lambda_handler(event, context): elif type == 'sentiment': sentiment = result.get('Sentiment', 'NEUTRAL') if sentiment: - cursor.execute("""update articles set sentiment = %s where articles_id = %s""", (sentiment, article_id)) + cursor.execute("""update articles set sentiment = %s where article_id = %s""", (sentiment, article_id)) conn.commit() cursor.close() ## delete the s3 object - s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) + # s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) conn.close() except Exception as e: print(f"Error processing record: {e}") @@ -78,17 +78,25 @@ def lambda_handler(event, context): def add_entities_to_article(cursor, article_id, entities): entities_text = [entity['Text'] for entity in entities] + print(f"Entities to be added: {entities_text}") cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) - entity_db_array = [row[0] for row in cursor.fetchall()] + entity_db_array = cursor.fetchall() + print(f"Entities in DB: {entity_db_array}") location_mentions = [] officials_involved = [] - relevance_category = cursor.execute("SELECT * FROM articles WHERE article_id in %s", (article_id,)).fetchall() + relevance_category = [] + print(f"article_id: {article_id}") + + print(f"Relevance category: {relevance_category}") for entity in entities: - entity_in_db = [db_entity for db_entity in entity_db_array if db_entity['entity'] == entity['Text']] + print(f"Processing entity: {entity}") + entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3] == entity['Text']] + print(f"Entity in DB: {entity_in_db}") if not entity_in_db: current_time = datetime.datetime.utcnow() cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) db_entity = cursor.fetchone() + print(f"Inserted new entity: {db_entity}") if entity['Type'] == 'LOCATION': location_mentions.append(db_entity[0]) elif entity['Type'] == 'PERSON': @@ -96,38 +104,44 @@ def add_entities_to_article(cursor, article_id, entities): else: relevance_category.append(db_entity[0]) else: + print(f"Entity already exists in DB: {entity_in_db}") if entity['Type'] == 'LOCATION': - location_mentions.append(entity_in_db[0]['Id']) + location_mentions.append(entity_in_db[0][0]) elif entity['Type'] == 'PERSON': - officials_involved.append(entity_in_db[0]['Id']) + officials_involved.append(entity_in_db[0][0]) else: - relevance_category.append(entity_in_db[0]['Id']) + relevance_category.append(entity_in_db[0][0]) if location_mentions: location_mentions = ','.join(map(str, location_mentions)) - cursor.execute("""update articles set location_mentions = %s where articles_id = %s""", (location_mentions, article_id)) + cursor.execute("""update articles set location_mentions = %s where article_id = %s""", (location_mentions, article_id)) if officials_involved: officials_involved = ','.join(map(str, officials_involved)) - cursor.execute("""update articles set officials_involved = %s where articles_id = %s""", (officials_involved, article_id)) + cursor.execute("""update articles set officials_involved = %s where article_id = %s""", (officials_involved, article_id)) if relevance_category: + cursor.execute("SELECT relevance_category FROM articles WHERE article_id = %s", (article_id,)) + existing = cursor.fetchone() relevance_category = ','.join(map(str, relevance_category)) - cursor.execute("""update articles set relevance_category = %s where articles_id = %s""", (relevance_category, article_id)) + if existing[0] is not None: + print(f"Existing relevance category: {existing[0]}") + relevance_category = relevance_category + ',' + existing[0] + cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) -events = [ - { - "s3": { - "bucket": { - "name": "awstraindata" - }, - "object": { - "key": "output/entities/269854564686-NER-ec0e8172443cfe1bc633b674b3fd4c44/output/output.tar.gz" - } - } - } -] -obj= { - "Records": events -} -lambda_handler(obj, None) \ No newline at end of file +# events = [ +# { +# "s3": { +# "bucket": { +# "name": "awstraindata" +# }, +# "object": { +# "key": "output/entities/269854564686-NER-7b5218ec8e556761890504a59e10da02/output/output.tar.gz" +# } +# } +# } +# ] +# obj= { +# "Records": events +# } +# lambda_handler(obj, None) \ No newline at end of file From 7bfd99066a27f0de1b9d7dd88fc7048c38347edd Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 17:01:03 +0530 Subject: [PATCH 036/114] test script update test script update --- backend/input_handler/input_handler_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/backend/input_handler/input_handler_test.py b/backend/input_handler/input_handler_test.py index 053b51d..5c837a8 100644 --- a/backend/input_handler/input_handler_test.py +++ b/backend/input_handler/input_handler_test.py @@ -44,16 +44,16 @@ input_csv = pd.read_csv(io.BytesIO(input_csv_object['Body'].read())) for index, row in input_csv.iterrows(): print(f"Processing row {index}: {row}") - articles_id = str(uuid.uuid4()) # Generate a unique ID for each article + article_id = str(uuid.uuid4()) # Generate a unique ID for each article cursor.execute(""" - INSERT INTO articles (articles_id, title, body, source, published_date) - VALUES (%s, %s, %s, %s, %s)""", (articles_id, row[1], row[2], row[3], row[4])) + INSERT INTO articles (article_id, title, body, source, published_date) + VALUES (%s, %s, %s, %s, %s)""", (article_id, row[1], row[2], row[3], row[4])) # Convert to CSV in-memory csv_buffer = io.StringIO() writer = csv.writer(csv_buffer) # writer.writerow(row.headers) # Write header writer.writerow(row) - s3_path = 'input/' + articles_id + '.csv' + s3_path = 'input/' + article_id + '.csv' s3_uri = 's3://' + bucket_name + '/' + s3_path s3.put_object( Bucket=bucket_name, @@ -66,7 +66,7 @@ OutputDataConfig={'S3Uri': 's3://awstraindata/output/entities/'}, DataAccessRoleArn=role_arn, LanguageCode='en', - JobName='MyEntityDetectionJob_'+ articles_id + '_' + str(int(time.time())) + JobName='MyEntityDetectionJob_'+ article_id + '_' + str(int(time.time())) ) result = comprehend.describe_entities_detection_job(JobId=entities_job['JobId']) entities_output = result['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri'] @@ -77,7 +77,7 @@ OutputDataConfig={'S3Uri': 's3://awstraindata/output/sentiment/'}, DataAccessRoleArn=role_arn, LanguageCode='en', - JobName='MySentimentDetectionJob_' + articles_id + '_' + str(int(time.time())) + JobName='MySentimentDetectionJob_' + article_id + '_' + str(int(time.time())) ) res = comprehend.describe_sentiment_detection_job(JobId=sentiment_job['JobId']) sentiment_output = res['SentimentDetectionJobProperties']['OutputDataConfig']['S3Uri'] @@ -88,7 +88,7 @@ OutputDataConfig={'S3Uri': 's3://awstraindata/output/keyphrases/'}, DataAccessRoleArn=role_arn, LanguageCode='en', - JobName='MyKeyPhrasesDetectionJob_' + articles_id + '_' + str(int(time.time())) + JobName='MyKeyPhrasesDetectionJob_' + article_id + '_' + str(int(time.time())) ) res = comprehend.describe_key_phrases_detection_job(JobId=phrases_job['JobId']) key_phrases_output = res['KeyPhrasesDetectionJobProperties']['OutputDataConfig']['S3Uri'] @@ -98,7 +98,7 @@ print("Inserting into comprehend_jobs table") cursor.execute(""" INSERT INTO comprehend_jobs (article_id, input_s3_uri, entities_path, sentiment_path, key_phrases_path) - VALUES (%s, %s, %s, %s, %s)""", (articles_id, s3_uri, entities_output.replace('s3://awstraindata/', ''), sentiment_output.replace('s3://awstraindata/', ''), key_phrases_output.replace('s3://awstraindata/', ''))) + VALUES (%s, %s, %s, %s, %s)""", (article_id, s3_uri, entities_output.replace('s3://awstraindata/', ''), sentiment_output.replace('s3://awstraindata/', ''), key_phrases_output.replace('s3://awstraindata/', ''))) conn.commit() cursor.close() conn.close() \ No newline at end of file From c2b1b045d581ca624264e79f722913b0b169e575 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 17:12:35 +0530 Subject: [PATCH 037/114] Fix Submit API Fix Submit API --- backend/Submit/Submit.py | 96 ++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 54 deletions(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index eccec4d..0e06bab 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -1,3 +1,5 @@ +import base64 +from json import decoder import uuid from Utils import get_postgresql_connection from fastapi import FastAPI, UploadFile, File @@ -13,65 +15,51 @@ s3 = boto3.client('s3') BUCKET_NAME = 'awstraindata' -@app.post("/upload/") -async def lambda_handler(file: UploadFile = File(...)): - if not file.filename.endswith(".docx"): - return JSONResponse(status_code=400, content={"error": "Only .docx files are supported"}) - +def lambda_handler(event, context): try: + # Decode base64-encoded body (API Gateway encodes binary automatically) + body = base64.b64decode(event['body']) + + # Get content-type header + content_type = event['headers'].get('Content-Type') or event['headers'].get('content-type') + if not content_type: + return {"statusCode": 400, "body": "Missing Content-Type header"} + + # Parse multipart form + multipart_data = decoder.MultipartDecoder(body, content_type) s3_urls = [] - articles = extract_articles(file.file) conn = get_postgresql_connection() cursor = conn.cursor() - cursor.execute("drop table if exists articles") - cursor.execute("""CREATE TABLE IF NOT EXISTS articles ( - articles_id TEXT, - title TEXT, - body TEXT, - source TEXT, - published_date TEXT, - location_mentions TEXT, - officials_involved TEXT, - relevance_category TEXT, - sentiment TEXT - )""") - cursor.execute(""" - drop table if exists comprehend_jobs - """) - cursor.execute(""" - CREATE TABLE IF NOT EXISTS comprehend_jobs ( - article_id TEXT, - input_s3_uri TEXT, - entities_path TEXT, - sentiment_path TEXT, - key_phrases_path TEXT - ) - """) - for article in articles: - output_csv = io.StringIO() - writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) - writer.writeheader() - writer.writerow(article) - # Generate unique filename - csv_filename = f"/input/articles-{uuid.uuid4()}.csv" - articles_id = str(uuid.uuid4()) # Generate a unique ID for each article - cursor.execute(""" - INSERT INTO articles (articles_id, title, body, source, published_date) - VALUES (%s, %s, %s, %s, %s)""", (articles_id, article[0], article[1], article[2], article[3])) - # Upload to S3 - s3.put_object( - Bucket=BUCKET_NAME, - Key=csv_filename, - Body=output_csv.getvalue(), - ContentType='text/csv' - ) - s3_url = f"https://{BUCKET_NAME}.s3.amazonaws.com/{csv_filename}" - s3_urls.append(s3_url) - return {"status": "success", "count": len(articles), "data": articles, "s3_urls": s3_urls} - + for part in multipart_data.parts: + # Extract file name from content-disposition + cd = part.headers.get(b'Content-Disposition', b'').decode() + if 'filename=' not in cd: + continue + filename = cd.split('filename="')[1].split('"')[0] + articles = extract_articles(io.BytesIO(part.content)) + for article in articles: + output_csv = io.StringIO() + writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) + writer.writeheader() + writer.writerow(article) + # Generate unique filename + csv_filename = f"/input/articles-{uuid.uuid4()}.csv" + article_id = str(uuid.uuid4()) # Generate a unique ID for each article + cursor.execute(""" + INSERT INTO articles (article_id, title, body, source, published_date) + VALUES (%s, %s, %s, %s, %s)""", (article_id, article[0], article[1], article[2], article[3])) + # Upload to S3 + s3.put_object( + Bucket=BUCKET_NAME, + Key=csv_filename, + Body=output_csv.getvalue(), + ContentType='text/csv' + ) + s3_url = f"https://{BUCKET_NAME}.s3.amazonaws.com/{csv_filename}" + s3_urls.append(s3_url) + return {"statusCode": 200, "status": "success", "count": len(articles), "data": articles, "s3_urls": s3_urls} except Exception as e: - return JSONResponse(status_code=500, content={"error": str(e)}) - + return {"statusCode": 500, "body": f"āŒ Error: {str(e)}"} def extract_articles(file_stream): doc = Document(file_stream) From eb3665f8d3c4a5fd11ea097f02e2c10048234600 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sat, 28 Jun 2025 17:13:35 +0530 Subject: [PATCH 038/114] Pushed first commit for web_api --- .gitignore | 3 + backend/requirements.txt | 7 +- backend/web_api/bedrock_agent.py | 89 ++++++++++++ backend/web_api/bedrock_agent_invoke.py | 86 +++++++++++ backend/web_api/database.py | 19 +++ backend/web_api/main.py | 186 ++++++++++++++++++++++++ backend/web_api/models.py | 39 +++++ 7 files changed, 428 insertions(+), 1 deletion(-) create mode 100644 backend/web_api/bedrock_agent.py create mode 100644 backend/web_api/bedrock_agent_invoke.py create mode 100644 backend/web_api/database.py create mode 100644 backend/web_api/main.py create mode 100644 backend/web_api/models.py diff --git a/.gitignore b/.gitignore index fd3a82d..1b02c6d 100644 --- a/.gitignore +++ b/.gitignore @@ -193,3 +193,6 @@ cython_debug/ .cursorignore .cursorindexingignore .DS_Store + +node_modules +package-lock.json \ No newline at end of file diff --git a/backend/requirements.txt b/backend/requirements.txt index 4b86188..b74be0d 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -3,4 +3,9 @@ pandas boto3 psycopg2-binary==2.9.9 python-docx -fastapi[all] \ No newline at end of file +fastapi[all] +uvicorn[standard] +psycopg[binary] +psycopg-pool +pydantic +python-dotenv \ No newline at end of file diff --git a/backend/web_api/bedrock_agent.py b/backend/web_api/bedrock_agent.py new file mode 100644 index 0000000..8967dbf --- /dev/null +++ b/backend/web_api/bedrock_agent.py @@ -0,0 +1,89 @@ +# bedrock_agent.py +import boto3 +import json + +# It's a good practice to create the client once and reuse it. +# Ensure your AWS credentials are configured (e.g., via `aws configure`) +# and you have selected a region where the model is available. +bedrock_runtime = boto3.client( + service_name="bedrock-runtime", + region_name="us-east-1" # e.g., us-east-1 +) + +# Choose a model. Claude 3 Sonnet is a great choice for this task. +# You can also use "anthropic.claude-v2:1", "anthropic.claude-instant-v1", etc. +MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0" + +def generate_sql_from_prompt(user_question: str, table_schema: str) -> str: + """ + Uses Bedrock to generate a SQL query from a user's natural language question. + + Args: + user_question: The question from the user. + table_schema: The CREATE TABLE statement for the relevant table. + + Returns: + A SQL query string. + """ + + # This prompt engineering is the most critical part of the process. + # It gives the model context, instructions, and constraints. + prompt = f""" +Human: You are a PostgreSQL expert. Your task is to generate a SQL query based on a user's question. +You will be given the database schema and a question. +You MUST follow these rules: +1. ONLY generate a SQL `SELECT` query. Do not generate any other type of query (INSERT, UPDATE, DELETE, etc.). +2. Do not include any text, explanation, or markdown formatting before or after the SQL query. Your entire response must be only the SQL query itself. +3. The query should be for a PostgreSQL database. + +Here is the table schema: + +{table_schema} + + +Here is the user's question: + +{user_question} + + +Assistant: +""" + + # Prepare the payload for the Bedrock API + body = json.dumps({ + "anthropic_version": "bedrock-2023-05-31", + "max_tokens": 1000, + "temperature": 0.0, # Use 0 for deterministic, factual responses + "messages": [ + { + "role": "user", + "content": [{"type": "text", "text": prompt}], + } + ], + }) + + try: + # Invoke the model + response = bedrock_runtime.invoke_model( + body=body, + modelId=MODEL_ID, + accept='application/json', + contentType='application/json' + ) + + # Parse the response body + response_body = json.loads(response.get('body').read()) + + # Extract the generated text + generated_sql = response_body.get('content')[0].get('text') + + # Clean up the response (remove potential leading/trailing whitespace or markdown) + cleaned_sql = generated_sql.strip().replace("```sql", "").replace("```", "").strip() + + print(f"Bedrock generated SQL: {cleaned_sql}") + return cleaned_sql + + except Exception as e: + print(f"Error invoking Bedrock model: {e}") + # In a real app, you'd want more robust error handling + raise \ No newline at end of file diff --git a/backend/web_api/bedrock_agent_invoke.py b/backend/web_api/bedrock_agent_invoke.py new file mode 100644 index 0000000..fbe3bbb --- /dev/null +++ b/backend/web_api/bedrock_agent_invoke.py @@ -0,0 +1,86 @@ +# bedrock_agent_invoke.py +import boto3 +import json +from typing import Optional + +# Use the 'bedrock-agent-runtime' client for invoking agents +bedrock_agent_runtime = boto3.client( + service_name="bedrock-agent-runtime", + region_name="us-east-1" # Use the region where your agent is deployed +) + +def invoke_bedrock_agent_to_get_sql( + question: str, + agent_id: str, + agent_alias_id: str, + session_id: str +) -> Optional[str]: + """ + Invokes a pre-configured Bedrock Agent and extracts the generated SQL query + from its response trace. + + Args: + question: The user's natural language question. + agent_id: The ID of your Bedrock Agent. + agent_alias_id: The alias ID for the agent version you want to use. + session_id: A unique identifier for the conversation session. + + Returns: + The generated SQL query string, or None if not found. + """ + try: + # The invoke_agent API returns a streaming response. + response = bedrock_agent_runtime.invoke_agent( + agentId=agent_id, + agentAliasId=agent_alias_id, + sessionId=session_id, + inputText=question, + streamingConfigurations = { + "applyGuardrailInterval" : 20, + "streamFinalResponse" : False + } + ) + + event_stream = response['completion'] + final_sql_query = None + + # The response is a stream of events. We need to parse it to find the + # 'observation' from the action group that contains the final SQL query. + for event in event_stream: + if 'trace' in event: + trace_part = event['trace']['trace'] + if 'observation' in trace_part: + observation = trace_part['observation'] + if 'actionGroupInvocationOutput' in observation: + output_str = observation['actionGroupInvocationOutput']['text'] + # The output is often a JSON string, we need to parse it + try: + output_json = json.loads(output_str) + # The key 'generatedQuery' might vary based on your + # Lambda function's return format for the action group. + # Inspect your agent's trace to find the correct key. + if 'generatedQuery' in output_json: + final_sql_query = output_json['generatedQuery'] + print(f"Extracted SQL from Agent trace: {final_sql_query}") + # We found the query, no need to process further + break + except json.JSONDecodeError: + print(f"Could not decode observation output: {output_str}") + + + if not final_sql_query: + # Fallback if the detailed trace isn't as expected, check final response + # Note: This part is less reliable for getting the raw SQL + for event in event_stream: + if 'chunk' in event: + data = json.loads(event['chunk']['bytes'].decode()) + if data['type'] == 'finalResponse': + print("Warning: Could not find SQL in trace, final response text might not be a query.") + # This text is often a natural language answer, not the SQL itself + break + + return final_sql_query + + except Exception as e: + print(f"Error invoking Bedrock Agent: {e}") + raise \ No newline at end of file diff --git a/backend/web_api/database.py b/backend/web_api/database.py new file mode 100644 index 0000000..52d2342 --- /dev/null +++ b/backend/web_api/database.py @@ -0,0 +1,19 @@ +# database.py +from sqlmodel import create_engine, SQLModel, Session + +# Use a real database URL in production +DATABASE_URL = "postgresql://postgres:!>.VZS)91jj5b0aer@ap-ai-hackathon.cluster-cqt08oi8i1b6.us-east-1.rds.amazonaws.com:5432/postgres?sslmode=require" + +# The 'connect_args' is needed for SQLite, but not for PostgreSQL. +# For PostgreSQL, you can remove it. +engine = create_engine(DATABASE_URL, echo=True) + +def create_db_and_tables(): + # This function creates all tables defined by SQLModel models + # that are subclasses of SQLModel. It's good to run this once at startup. + SQLModel.metadata.create_all(engine) + +# Dependency function to get a database session +def get_session(): + with Session(engine) as session: + yield session \ No newline at end of file diff --git a/backend/web_api/main.py b/backend/web_api/main.py new file mode 100644 index 0000000..7e23adb --- /dev/null +++ b/backend/web_api/main.py @@ -0,0 +1,186 @@ +# main.py +import os +from typing import List +import uuid +from fastapi import FastAPI, Depends, HTTPException, Response +from sqlmodel import Session, select + +from database import get_session, create_db_and_tables +from models import Articles, ArticleCreate, ArticleRead, Clusters, ClusterCreate, ClusterRead +from typing import List, Dict, Any + +from contextlib import asynccontextmanager +# from typing import List, Dict, Any + +# from fastapi import FastAPI, HTTPException, Depends +from pydantic import BaseModel, Field +from psycopg_pool import ConnectionPool +from psycopg import ProgrammingError + +# # Import our new Bedrock agent function +# from bedrock_agent import generate_sql_from_prompt + +# Import our new agent invoker function +from bedrock_agent_invoke import invoke_bedrock_agent_to_get_sql + +# Load environment variables from .env file +from dotenv import load_dotenv +load_dotenv() + +# --- Agent Configuration --- +# Load from environment variables for security and flexibility +AGENT_ID = os.environ.get("BEDROCK_AGENT_ID") +AGENT_ALIAS_ID = os.environ.get("BEDROCK_AGENT_ALIAS_ID", "TSTALIASID") # TSTALIASID is a common default + +app = FastAPI() + +@asynccontextmanager +async def lifespan(app: FastAPI): + print("Application startup...") + yield + print("Application shutdown...") + pool.close() + + +# This event handler runs once when the application starts. +# @app.on_event("startup") +# def on_startup(): +# create_db_and_tables() + +@app.post("/articles/", response_model=ArticleRead) +def create_article(hero: ArticleCreate, session: Session = Depends(get_session)): + db_article = Articles.model_validate(hero) + session.add(db_article) + session.commit() + session.refresh(db_article) + return db_article + +@app.get("/articles/", response_model=List[ArticleRead]) +def read_articles(skip: int = 0, limit: int = 100, session: Session = Depends(get_session)): + heroes = session.exec(select(Articles).offset(skip).limit(limit)).all() + return heroes + +@app.get("/articles/{hero_id}", response_model=ArticleRead) +def read_article(hero_id: int, session: Session = Depends(get_session)): + article = session.get(Articles, hero_id) + if not article: + raise HTTPException(status_code=404, detail="Article not found") + return article + +@app.post("/clusters/", response_model=ClusterRead) +def create_cluster(cluster: ClusterCreate, session: Session = Depends(get_session)): + db_cluster = Clusters.model_validate(cluster) + session.add(db_cluster) + session.commit() + session.refresh(db_cluster) + return db_cluster + +@app.get("/clusters/", response_model=List[ClusterRead]) +def read_clusters(skip: int = 0, limit: int = 100, session: Session = Depends(get_session)): + clusters = session.exec(select(Clusters).offset(skip).limit(limit)).all() + return clusters + +@app.get("/clusters/{cluster_id}", response_model=ClusterRead) +def read_cluster(cluster_id: str, session: Session = Depends(get_session)): + cluster = session.get(Clusters, cluster_id) + if not cluster: + raise HTTPException(status_code=404, detail="Cluster not found") + return cluster + +@app.get("/groupedClusters/") +def grouped_clusters(session: Session = Depends(get_session)): + clusters = session.exec(select(Clusters)).all() + articles = session.exec(select(Articles)).all() + # Build a mapping from cluster id to articles + cluster_map = {cluster.id: [] for cluster in clusters} + for article in articles: + # Assuming 'linkedarticles' in Clusters is a comma-separated list of article ids + for cluster in clusters: + if cluster.linkedarticles: + linked_ids = [x.strip() for x in cluster.linkedarticles.split(",") if x.strip()] + if article.articles_id in linked_ids: + cluster_map[cluster.id].append(article) + # Build the response + result = [] + for cluster in clusters: + cluster_dict = cluster.dict() + cluster_dict["articles"] = cluster_map[cluster.id] + result.append(cluster_dict) + return result + + +# --- Pydantic Models --- +class NaturalLanguageQuery(BaseModel): + question: str = Field(..., example="How many heroes are there?") + session_id: str | None = Field(default=None, description="Conversation session ID. A new one is generated if not provided.") + +# --- Database Connection --- +# IMPORTANT: Use a read-only user for the database connection. +DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://your_user:your_password@your_aurora_endpoint/myappdb") +pool = ConnectionPool(conninfo=DATABASE_URL) + +def get_db_connection(): + with pool.connection() as conn: + yield conn + +@asynccontextmanager +async def lifespan(app: FastAPI): + if not AGENT_ID: + raise RuntimeError("BEDROCK_AGENT_ID environment variable not set.") + print("Application startup...") + yield + print("Application shutdown...") + pool.close() + +app = FastAPI( + title="FastAPI with Bedrock Agents", + lifespan=lifespan +) + +@app.post("/query/agent", response_model=List[Dict[str, Any]]) +def query_with_bedrock_agent(query: NaturalLanguageQuery, conn=Depends(get_db_connection)): + """ + Takes a natural language question, sends it to a pre-configured Bedrock Agent, + executes the returned SQL, and returns the results. + """ + session_id = query.session_id or str(uuid.uuid4()) + print(f"Invoking agent for question: '{query.question}' with session_id: {session_id}") + + # 1. Invoke the agent to get the SQL query + try: + generated_sql = invoke_bedrock_agent_to_get_sql( + question=query.question, + agent_id=AGENT_ID, + agent_alias_id=AGENT_ALIAS_ID, + session_id=session_id + ) + print("Generated SQL from agent:", generated_sql) # Debug print + except Exception as e: + raise HTTPException(status_code=500, detail=f"Failed to invoke Bedrock Agent: {e}") + + if not generated_sql: + raise HTTPException(status_code=404, detail="Agent did not return a SQL query.") + + # 2. *** CRITICAL SECURITY CHECK *** + if not generated_sql.strip().upper().startswith("SELECT"): + raise HTTPException( + status_code=400, + detail="Agent returned a non-SELECT query. Execution aborted." + ) + + # 3. Execute the SQL from the agent + try: + with conn.cursor() as cur: + cur.execute(generated_sql) + if cur.description is None: + return [] + + column_names = [desc[0] for desc in cur.description] + results = cur.fetchall() + return [dict(zip(column_names, row)) for row in results] + + except ProgrammingError as e: + raise HTTPException(status_code=400, detail=f"Invalid SQL Query from Agent: {e}") + except Exception as e: + raise HTTPException(status_code=500, detail=f"Database execution error: {e}") + \ No newline at end of file diff --git a/backend/web_api/models.py b/backend/web_api/models.py new file mode 100644 index 0000000..5f3235d --- /dev/null +++ b/backend/web_api/models.py @@ -0,0 +1,39 @@ +# models.py +from typing import Optional +from sqlmodel import Field, SQLModel + +class Articles(SQLModel, table=True): + articles_id: Optional[str] = Field(default=None, primary_key=True) + title: str = Field(index=True) + body: str + source: str + published_date: Optional[str] = Field(default=None, index=True) + # location_mentions: Optional[str] = Field(default=None, index=True) + # officials_involved: Optional[str] = Field(default=None, index=True) + # relenace_category: Optional[str] = Field(default=None, index=True) + sentiment: Optional[str] = Field(default=None, index=True) + # name: str = Field(index=True) + # secret_name: str + # age: Optional[int] = Field(default=None, index=True) + + +class ArticleCreate(Articles): + pass + +class ArticleRead(Articles): + articles_id: str + + +class Clusters(SQLModel, table=True): + id: Optional[str] = Field(default=None, primary_key=True) + title: str = Field(index=True) + linkedarticles: Optional[str] = Field(default=None, index=True) + startdate: Optional[str] = Field(default=None, index=True) + enddate: Optional[str] = Field(default=None, index=True) + +class ClusterCreate(Clusters): + pass + +class ClusterRead(Clusters): + id: str + From ab6842e2a2bc929d63bc5f135cee011f780aac6f Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 17:17:24 +0530 Subject: [PATCH 039/114] fix fix --- backend/Submit/Submit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 0e06bab..c8fd8c0 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -15,7 +15,7 @@ s3 = boto3.client('s3') BUCKET_NAME = 'awstraindata' -def lambda_handler(event, context): +def lambda_handler(event): try: # Decode base64-encoded body (API Gateway encodes binary automatically) body = base64.b64decode(event['body']) From e3f2ee0cd6d1740712604b2ffd26a8230805df38 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 18:02:23 +0530 Subject: [PATCH 040/114] duplicate entity duplicate entity --- backend/output_handler/output_handler.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backend/output_handler/output_handler.py b/backend/output_handler/output_handler.py index 981e973..8669089 100644 --- a/backend/output_handler/output_handler.py +++ b/backend/output_handler/output_handler.py @@ -53,13 +53,13 @@ def lambda_handler(event, context): entity_array = result['Entities'] if entity_array: ## get the entities from the entities table - add_entities_to_article(cursor, article_id, entity_array) + add_entities_to_article(conn, cursor, article_id, entity_array) elif type == 'keyphrases': keyPhrases_array = result['KeyPhrases'] if keyPhrases_array: for keyPhrase in keyPhrases_array: keyPhrase['Type'] = 'KeyPhrase' - add_entities_to_article(cursor, article_id, keyPhrases_array) + add_entities_to_article(conn, cursor, article_id, keyPhrases_array) elif type == 'sentiment': sentiment = result.get('Sentiment', 'NEUTRAL') if sentiment: @@ -76,7 +76,7 @@ def lambda_handler(event, context): 'body': json.dumps({'error': str(e)}) } -def add_entities_to_article(cursor, article_id, entities): +def add_entities_to_article(conn, cursor, article_id, entities): entities_text = [entity['Text'] for entity in entities] print(f"Entities to be added: {entities_text}") cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) @@ -90,11 +90,12 @@ def add_entities_to_article(cursor, article_id, entities): print(f"Relevance category: {relevance_category}") for entity in entities: print(f"Processing entity: {entity}") - entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3] == entity['Text']] + entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] print(f"Entity in DB: {entity_in_db}") if not entity_in_db: current_time = datetime.datetime.utcnow() cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) + conn.commit() db_entity = cursor.fetchone() print(f"Inserted new entity: {db_entity}") if entity['Type'] == 'LOCATION': From 6ebb4c433b4b223f489edb44d6176121928792d9 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sat, 28 Jun 2025 18:04:25 +0530 Subject: [PATCH 041/114] Changed main.py to web_api.py and added package for running fastAPI in lambda --- backend/output_handler/output_handler.py | 9 +++++---- backend/web_api/{main.py => web_api.py} | 10 +++++++++- 2 files changed, 14 insertions(+), 5 deletions(-) rename backend/web_api/{main.py => web_api.py} (97%) diff --git a/backend/output_handler/output_handler.py b/backend/output_handler/output_handler.py index 981e973..8669089 100644 --- a/backend/output_handler/output_handler.py +++ b/backend/output_handler/output_handler.py @@ -53,13 +53,13 @@ def lambda_handler(event, context): entity_array = result['Entities'] if entity_array: ## get the entities from the entities table - add_entities_to_article(cursor, article_id, entity_array) + add_entities_to_article(conn, cursor, article_id, entity_array) elif type == 'keyphrases': keyPhrases_array = result['KeyPhrases'] if keyPhrases_array: for keyPhrase in keyPhrases_array: keyPhrase['Type'] = 'KeyPhrase' - add_entities_to_article(cursor, article_id, keyPhrases_array) + add_entities_to_article(conn, cursor, article_id, keyPhrases_array) elif type == 'sentiment': sentiment = result.get('Sentiment', 'NEUTRAL') if sentiment: @@ -76,7 +76,7 @@ def lambda_handler(event, context): 'body': json.dumps({'error': str(e)}) } -def add_entities_to_article(cursor, article_id, entities): +def add_entities_to_article(conn, cursor, article_id, entities): entities_text = [entity['Text'] for entity in entities] print(f"Entities to be added: {entities_text}") cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) @@ -90,11 +90,12 @@ def add_entities_to_article(cursor, article_id, entities): print(f"Relevance category: {relevance_category}") for entity in entities: print(f"Processing entity: {entity}") - entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3] == entity['Text']] + entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] print(f"Entity in DB: {entity_in_db}") if not entity_in_db: current_time = datetime.datetime.utcnow() cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) + conn.commit() db_entity = cursor.fetchone() print(f"Inserted new entity: {db_entity}") if entity['Type'] == 'LOCATION': diff --git a/backend/web_api/main.py b/backend/web_api/web_api.py similarity index 97% rename from backend/web_api/main.py rename to backend/web_api/web_api.py index 7e23adb..c7d37ff 100644 --- a/backend/web_api/main.py +++ b/backend/web_api/web_api.py @@ -16,6 +16,7 @@ from pydantic import BaseModel, Field from psycopg_pool import ConnectionPool from psycopg import ProgrammingError +from mangum import Mangum # # Import our new Bedrock agent function # from bedrock_agent import generate_sql_from_prompt @@ -183,4 +184,11 @@ def query_with_bedrock_agent(query: NaturalLanguageQuery, conn=Depends(get_db_co raise HTTPException(status_code=400, detail=f"Invalid SQL Query from Agent: {e}") except Exception as e: raise HTTPException(status_code=500, detail=f"Database execution error: {e}") - \ No newline at end of file + +handler = Mangum(app) + +def lambda_handler(event, context): + """ + AWS Lambda handler for FastAPI app using Mangum adapter. + """ + return handler(event, context) From ebe2361c02210f21a1ada029af9c6ae6f9719551 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 18:04:03 +0530 Subject: [PATCH 042/114] making layer dynamic --- .github/workflows/lambda.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 11d15c8..fb50488 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -34,6 +34,12 @@ jobs: - name: Deploy Lambda Functions (No dependencies) run: | + LAYER_ARN=$(aws lambda list-layer-versions \ + --layer-name "$LAYER_NAME" \ + --query 'LayerVersions[0].LayerVersionArn' \ + --output text) + + echo "Using latest layer version: $LAYER_ARN" for dir in "$LAMBDA_DIR"/*/; do dir=${dir%/} function_name=$(basename "$dir") @@ -53,7 +59,7 @@ jobs: mkdir -p "$build_dir" # Copy Lambda source file - cp "$entry_point" "$build_dir/" + cp -r "$dir"/* "$build_dir/" # Copy all top-level shared files from backend (excluding directories and requirements.txt) find "$LAMBDA_DIR" -maxdepth 1 -type f ! -name "requirements.txt" -exec cp {} "$build_dir/" \; @@ -73,6 +79,7 @@ jobs: --function-name "$function_name" \ --s3-bucket "$DEPLOY_BUCKET" \ --s3-key "${function_name}.zip" + --layers "$LAYER_ARN" else echo "Creating Lambda: $function_name" aws lambda create-function \ @@ -83,6 +90,6 @@ jobs: --code S3Bucket="$DEPLOY_BUCKET",S3Key="${function_name}.zip" \ --timeout 900 \ --vpc-config SubnetIds=subnet-02e62e34308bb07d5,subnet-0534b99dd34e646f1,SecurityGroupIds=sg-0b9a6b812b30a1107 \ - --layers "arn:aws:lambda:us-east-1:269854564686:layer:my-python-layer:4" # šŸ” Replace with your actual layer ARN + --layers "$LAYER_ARN" fi done From 2158b1373e438527fe57b1e93e4d63df1d7ba257 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 18:05:40 +0530 Subject: [PATCH 043/114] making layer dynamic --- .github/workflows/lambda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index fb50488..2d03b92 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -35,7 +35,7 @@ jobs: - name: Deploy Lambda Functions (No dependencies) run: | LAYER_ARN=$(aws lambda list-layer-versions \ - --layer-name "$LAYER_NAME" \ + --layer-name my-python-layer" \ --query 'LayerVersions[0].LayerVersionArn' \ --output text) From 01a4d63103f63c83c8c006fe19fb0c41b180c445 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 18:06:21 +0530 Subject: [PATCH 044/114] making layer dynamic --- .github/workflows/lambda.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 2d03b92..2794feb 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -35,7 +35,7 @@ jobs: - name: Deploy Lambda Functions (No dependencies) run: | LAYER_ARN=$(aws lambda list-layer-versions \ - --layer-name my-python-layer" \ + --layer-name "my-python-layer" \ --query 'LayerVersions[0].LayerVersionArn' \ --output text) From 4663f109ad5d0c37bf66545d0ecc2f30aeabc2f7 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 18:07:43 +0530 Subject: [PATCH 045/114] submit api lamba fix submit api lamba fix --- backend/Submit/Submit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index c8fd8c0..0e06bab 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -15,7 +15,7 @@ s3 = boto3.client('s3') BUCKET_NAME = 'awstraindata' -def lambda_handler(event): +def lambda_handler(event, context): try: # Decode base64-encoded body (API Gateway encodes binary automatically) body = base64.b64decode(event['body']) From 62ff738579224addba10f75c7a3271777c617fa8 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 18:09:06 +0530 Subject: [PATCH 046/114] making layer dynamic --- .github/workflows/api.yaml | 3 ++- .github/workflows/lambda.yaml | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/api.yaml b/.github/workflows/api.yaml index 1c3153b..275a0ec 100644 --- a/.github/workflows/api.yaml +++ b/.github/workflows/api.yaml @@ -2,7 +2,8 @@ name: Deploy API Gateway on: push: - branches: [main, backend_changes] + paths: + - 'api/*' jobs: deploy-api: diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 2794feb..924884d 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -79,7 +79,6 @@ jobs: --function-name "$function_name" \ --s3-bucket "$DEPLOY_BUCKET" \ --s3-key "${function_name}.zip" - --layers "$LAYER_ARN" else echo "Creating Lambda: $function_name" aws lambda create-function \ From 05086b33db6d4de2a4fa624aeb95f96f93f0b56a Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 18:12:15 +0530 Subject: [PATCH 047/114] submit API logging submit API logging --- backend/Submit/Submit.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 0e06bab..3119aa0 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -18,8 +18,9 @@ def lambda_handler(event, context): try: # Decode base64-encoded body (API Gateway encodes binary automatically) + print(f"Received event: {event}") body = base64.b64decode(event['body']) - + print(f"Decoded body length: {len(body)} bytes") # Get content-type header content_type = event['headers'].get('Content-Type') or event['headers'].get('content-type') if not content_type: @@ -27,17 +28,17 @@ def lambda_handler(event, context): # Parse multipart form multipart_data = decoder.MultipartDecoder(body, content_type) + print(f"Multipart data parts: {len(multipart_data.parts)}") s3_urls = [] conn = get_postgresql_connection() cursor = conn.cursor() for part in multipart_data.parts: + print(f"Processing part: {part.headers.get(b'Content-Disposition')}") # Extract file name from content-disposition - cd = part.headers.get(b'Content-Disposition', b'').decode() - if 'filename=' not in cd: - continue - filename = cd.split('filename="')[1].split('"')[0] articles = extract_articles(io.BytesIO(part.content)) + print(f"Extracted {len(articles)} articles from part") for article in articles: + print(f"Processing article: {article[0]}") output_csv = io.StringIO() writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) writer.writeheader() @@ -49,12 +50,14 @@ def lambda_handler(event, context): INSERT INTO articles (article_id, title, body, source, published_date) VALUES (%s, %s, %s, %s, %s)""", (article_id, article[0], article[1], article[2], article[3])) # Upload to S3 + print(f"Uploading CSV to S3: {csv_filename}") s3.put_object( Bucket=BUCKET_NAME, Key=csv_filename, Body=output_csv.getvalue(), ContentType='text/csv' ) + print(f"Uploaded CSV to S3: {csv_filename}") s3_url = f"https://{BUCKET_NAME}.s3.amazonaws.com/{csv_filename}" s3_urls.append(s3_url) return {"statusCode": 200, "status": "success", "count": len(articles), "data": articles, "s3_urls": s3_urls} From bbab44f99ee7bdf0fe0dd3c9dc5a646288ac8062 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 18:12:45 +0530 Subject: [PATCH 048/114] adding api --- api/web-api.json | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 api/web-api.json diff --git a/api/web-api.json b/api/web-api.json new file mode 100644 index 0000000..da394f3 --- /dev/null +++ b/api/web-api.json @@ -0,0 +1,10 @@ +{ + "apiName": "hackathon", + "resourcePath": "articles", + "method": { + "httpMethod": "GET", + "authorizationType": "NONE", + "lambdaFunctionName": "web_api" + }, + "deploy": true +} \ No newline at end of file From 5358819b8d141d2a94f7d6c93228aa26e282384d Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 18:18:03 +0530 Subject: [PATCH 049/114] filenames changes filenames changes --- backend/Submit/Submit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 3119aa0..a0a8b0e 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -18,7 +18,7 @@ def lambda_handler(event, context): try: # Decode base64-encoded body (API Gateway encodes binary automatically) - print(f"Received event: {event}") + print(f"Received event") body = base64.b64decode(event['body']) print(f"Decoded body length: {len(body)} bytes") # Get content-type header @@ -43,9 +43,9 @@ def lambda_handler(event, context): writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) writer.writeheader() writer.writerow(article) + article_id = str(uuid.uuid4()) # Generate unique filename - csv_filename = f"/input/articles-{uuid.uuid4()}.csv" - article_id = str(uuid.uuid4()) # Generate a unique ID for each article + csv_filename = f"/input/articles-{article_id}.csv" cursor.execute(""" INSERT INTO articles (article_id, title, body, source, published_date) VALUES (%s, %s, %s, %s, %s)""", (article_id, article[0], article[1], article[2], article[3])) From e3081e1489e9ff2287d268a9eed0f240040a8f53 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 18:23:10 +0530 Subject: [PATCH 050/114] fix encoding issue fix encoding issue --- backend/Submit/Submit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index a0a8b0e..3f62418 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -19,7 +19,10 @@ def lambda_handler(event, context): try: # Decode base64-encoded body (API Gateway encodes binary automatically) print(f"Received event") - body = base64.b64decode(event['body']) + if event.get("isBase64Encoded", False): + body = base64.b64decode(event['body']) + else: + body = event['body'].encode("utf-8") print(f"Decoded body length: {len(body)} bytes") # Get content-type header content_type = event['headers'].get('Content-Type') or event['headers'].get('content-type') From 037a9276202a99b7937b1afcc6589735d473c106 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 18:26:44 +0530 Subject: [PATCH 051/114] Fix CSV upload and add requests-toolbelt dependency Corrected the CSV file path and ensured database connections are properly closed in Submit.py. Replaced json.decoder with requests_toolbelt.multipart.decoder and added requests-toolbelt to requirements.txt. --- backend/Submit/Submit.py | 7 +++++-- backend/requirements.txt | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 3f62418..7a2466a 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -1,5 +1,5 @@ import base64 -from json import decoder +from requests_toolbelt.multipart import decoder import uuid from Utils import get_postgresql_connection from fastapi import FastAPI, UploadFile, File @@ -48,7 +48,7 @@ def lambda_handler(event, context): writer.writerow(article) article_id = str(uuid.uuid4()) # Generate unique filename - csv_filename = f"/input/articles-{article_id}.csv" + csv_filename = f"input/articles-{article_id}.csv" cursor.execute(""" INSERT INTO articles (article_id, title, body, source, published_date) VALUES (%s, %s, %s, %s, %s)""", (article_id, article[0], article[1], article[2], article[3])) @@ -60,9 +60,12 @@ def lambda_handler(event, context): Body=output_csv.getvalue(), ContentType='text/csv' ) + conn.commit() print(f"Uploaded CSV to S3: {csv_filename}") s3_url = f"https://{BUCKET_NAME}.s3.amazonaws.com/{csv_filename}" s3_urls.append(s3_url) + cursor.close() + conn.close() return {"statusCode": 200, "status": "success", "count": len(articles), "data": articles, "s3_urls": s3_urls} except Exception as e: return {"statusCode": 500, "body": f"āŒ Error: {str(e)}"} diff --git a/backend/requirements.txt b/backend/requirements.txt index b74be0d..e2bba92 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -8,4 +8,5 @@ uvicorn[standard] psycopg[binary] psycopg-pool pydantic -python-dotenv \ No newline at end of file +python-dotenv +requests-toolbelt \ No newline at end of file From c8502ceea761dc1a219d2f17a2105838e1a14b73 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 18:35:37 +0530 Subject: [PATCH 052/114] adding condition to update layer version --- .github/workflows/layer.yaml | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/.github/workflows/layer.yaml b/.github/workflows/layer.yaml index b4c49a1..299a7f9 100644 --- a/.github/workflows/layer.yaml +++ b/.github/workflows/layer.yaml @@ -61,4 +61,28 @@ jobs: uses: actions/upload-artifact@v4 with: name: lambda-layer - path: layer.zip \ No newline at end of file + path: layer.zip + + - name: Get latest layer version ARN + id: get-layer-version + run: | + LAYER_ARN=$(aws lambda list-layer-versions --layer-name ${{ env.LAYER_NAME }} \ + --query 'LayerVersions[0].LayerVersionArn' --output text) + echo "layer_arn=$LAYER_ARN" >> "$GITHUB_OUTPUT" + + - name: List functions using the layer + id: list-functions + run: | + FUNCTIONS=$(aws lambda list-functions --query \ + "Functions[?Layers && contains(join(',', Layers[].Arn), '${{ env.LAYER_NAME }}')].FunctionName" \ + --output text) + echo "functions=$FUNCTIONS" >> "$GITHUB_OUTPUT" + + - name: Update functions to use latest layer version + run: | + for function in ${{ steps.list-functions.outputs.functions }}; do + echo "Updating $function..." + aws lambda update-function-configuration \ + --function-name "$function" \ + --layers ${{ steps.get-layer-version.outputs.layer_arn }} + done From ea1f7d46fad553065307425031a63a31057e154b Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sat, 28 Jun 2025 18:35:44 +0530 Subject: [PATCH 053/114] Changing packages as web_api was throwing error --- backend/requirements.txt | 7 +++---- backend/web_api/web_api.py | 26 ++++++++------------------ 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index e2bba92..d0ba366 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,12 +1,11 @@ numpy pandas boto3 -psycopg2-binary==2.9.9 python-docx fastapi[all] uvicorn[standard] -psycopg[binary] -psycopg-pool pydantic python-dotenv -requests-toolbelt \ No newline at end of file +requests-toolbelt +sqlmodel +psycopg2 \ No newline at end of file diff --git a/backend/web_api/web_api.py b/backend/web_api/web_api.py index c7d37ff..eb7bacd 100644 --- a/backend/web_api/web_api.py +++ b/backend/web_api/web_api.py @@ -9,13 +9,12 @@ from models import Articles, ArticleCreate, ArticleRead, Clusters, ClusterCreate, ClusterRead from typing import List, Dict, Any -from contextlib import asynccontextmanager +from contextlib import asynccontextmanager, closing # from typing import List, Dict, Any # from fastapi import FastAPI, HTTPException, Depends from pydantic import BaseModel, Field -from psycopg_pool import ConnectionPool -from psycopg import ProgrammingError +from psycopg2 import connect, ProgrammingError from mangum import Mangum # # Import our new Bedrock agent function @@ -40,7 +39,6 @@ async def lifespan(app: FastAPI): print("Application startup...") yield print("Application shutdown...") - pool.close() # This event handler runs once when the application starts. @@ -118,24 +116,16 @@ class NaturalLanguageQuery(BaseModel): # --- Database Connection --- # IMPORTANT: Use a read-only user for the database connection. DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://your_user:your_password@your_aurora_endpoint/myappdb") -pool = ConnectionPool(conninfo=DATABASE_URL) def get_db_connection(): - with pool.connection() as conn: + conn = connect(DATABASE_URL) + try: yield conn - -@asynccontextmanager -async def lifespan(app: FastAPI): - if not AGENT_ID: - raise RuntimeError("BEDROCK_AGENT_ID environment variable not set.") - print("Application startup...") - yield - print("Application shutdown...") - pool.close() + finally: + conn.close() app = FastAPI( - title="FastAPI with Bedrock Agents", - lifespan=lifespan + title="FastAPI with Bedrock Agents" ) @app.post("/query/agent", response_model=List[Dict[str, Any]]) @@ -171,7 +161,7 @@ def query_with_bedrock_agent(query: NaturalLanguageQuery, conn=Depends(get_db_co # 3. Execute the SQL from the agent try: - with conn.cursor() as cur: + with closing(conn.cursor()) as cur: cur.execute(generated_sql) if cur.description is None: return [] From 69b92e8ecf7107de93cbf5ad3d7901e13cd68ef0 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sat, 28 Jun 2025 18:40:12 +0530 Subject: [PATCH 054/114] Error fix --- backend/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index d0ba366..e6e29ab 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -8,4 +8,4 @@ pydantic python-dotenv requests-toolbelt sqlmodel -psycopg2 \ No newline at end of file +psycopg[binary] \ No newline at end of file From 1a4c73d91f5261205c5c5cb7fc9b4c02ea565384 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 18:41:01 +0530 Subject: [PATCH 055/114] library fix requests_toolbelt --- backend/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index e6e29ab..2fdfafa 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -6,6 +6,6 @@ fastapi[all] uvicorn[standard] pydantic python-dotenv -requests-toolbelt +requests_toolbelt sqlmodel psycopg[binary] \ No newline at end of file From 885e446d3e5c26014633fbf9eedffeabe531645f Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 18:47:37 +0530 Subject: [PATCH 056/114] library conflict fix library conflict fix --- .github/workflows/lambda.yaml | 4 ++-- backend/Submit/Submit.py | 3 +-- backend/requirements.txt | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 924884d..2259a12 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -2,8 +2,8 @@ name: Deploy Lambdas on: push: - branches: [main, backend_changes] - workflow_dispatch: + paths: + - 'backend/*' env: LAMBDA_DIR: backend diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 7a2466a..ac3daca 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -2,8 +2,7 @@ from requests_toolbelt.multipart import decoder import uuid from Utils import get_postgresql_connection -from fastapi import FastAPI, UploadFile, File -from fastapi.responses import JSONResponse +from fastapi import FastAPI from docx import Document import csv import io diff --git a/backend/requirements.txt b/backend/requirements.txt index 2fdfafa..247a762 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -8,4 +8,4 @@ pydantic python-dotenv requests_toolbelt sqlmodel -psycopg[binary] \ No newline at end of file +psycopg2-binary==2.9.9 \ No newline at end of file From 840ed19a04123dffe8afb31cb9c58ea08891759b Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 18:58:52 +0530 Subject: [PATCH 057/114] file read debug --- backend/Submit/Submit.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index ac3daca..bdfb2cc 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -67,22 +67,28 @@ def lambda_handler(event, context): conn.close() return {"statusCode": 200, "status": "success", "count": len(articles), "data": articles, "s3_urls": s3_urls} except Exception as e: + traceback.print_exc() return {"statusCode": 500, "body": f"āŒ Error: {str(e)}"} def extract_articles(file_stream): + print(f"Extracting articles from file stream") doc = Document(file_stream) + print(f"Document loaded with {len(doc.paragraphs)} paragraphs") text = "\n".join(p.text for p in doc.paragraphs) pattern = re.compile( r'Title:\s*(.*?)\s*Source:\s*(.*?)\s*Date:\s*(.*?)\s*(?=(?:\d{1,2}\)|Title:)|\Z)', re.DOTALL ) matches = pattern.findall(text) + print(f"Found {len(matches)} matches in the document") articles = [] for match in matches: + print(f"Processing match: {match}") title = match[0].strip() source = match[1].strip() date_parts = match[2].strip().split("\n", 1) date = date_parts[0].strip() content = date_parts[1].strip() if len(date_parts) > 1 else "" + print(f"Extracted article - Title: {title}, Source: {source}, Date: {date}, Content length: {len(content)}") articles.append([title, source, date, content]) return articles \ No newline at end of file From 9d66bb9424a2f044683c5b55773500a922302c07 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 19:01:06 +0530 Subject: [PATCH 058/114] adding condition to update layer version --- .github/workflows/layer.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/layer.yaml b/.github/workflows/layer.yaml index 299a7f9..d0553b3 100644 --- a/.github/workflows/layer.yaml +++ b/.github/workflows/layer.yaml @@ -2,9 +2,7 @@ name: Build Lambda Layer on: push: - paths: - - 'backend/requirements.txt' - workflow_dispatch: + branches: [main, backend_changes] jobs: build-and-publish-layer: From 07e02f57f718c61e8122af4c03e0e420f00f77e0 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 19:01:55 +0530 Subject: [PATCH 059/114] adding condition to update layer version --- .github/workflows/lambda.yaml | 5 +++-- .github/workflows/layer.yaml | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/lambda.yaml b/.github/workflows/lambda.yaml index 2259a12..45cd394 100644 --- a/.github/workflows/lambda.yaml +++ b/.github/workflows/lambda.yaml @@ -2,8 +2,9 @@ name: Deploy Lambdas on: push: - paths: - - 'backend/*' + branches: + - main + - backend_changes env: LAMBDA_DIR: backend diff --git a/.github/workflows/layer.yaml b/.github/workflows/layer.yaml index d0553b3..299a7f9 100644 --- a/.github/workflows/layer.yaml +++ b/.github/workflows/layer.yaml @@ -2,7 +2,9 @@ name: Build Lambda Layer on: push: - branches: [main, backend_changes] + paths: + - 'backend/requirements.txt' + workflow_dispatch: jobs: build-and-publish-layer: From c93a1e307198112759934a4f13193e3083f82483 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 19:09:38 +0530 Subject: [PATCH 060/114] added trace added trace --- backend/Submit/Submit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index bdfb2cc..9b63556 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -8,6 +8,7 @@ import io import re import boto3 +import traceback app = FastAPI() From c9c95bd0e4b8231ed889be033d95a557945dcdaa Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sat, 28 Jun 2025 19:17:15 +0530 Subject: [PATCH 061/114] Another trial --- backend/requirements.txt | 2 +- backend/web_api/database.py | 6 +++++- backend/web_api/web_api.py | 12 +++++------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 247a762..51bd887 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -8,4 +8,4 @@ pydantic python-dotenv requests_toolbelt sqlmodel -psycopg2-binary==2.9.9 \ No newline at end of file +psycopg2-binary \ No newline at end of file diff --git a/backend/web_api/database.py b/backend/web_api/database.py index 52d2342..df5feb5 100644 --- a/backend/web_api/database.py +++ b/backend/web_api/database.py @@ -1,8 +1,12 @@ # database.py +import os from sqlmodel import create_engine, SQLModel, Session # Use a real database URL in production -DATABASE_URL = "postgresql://postgres:!>.VZS)91jj5b0aer@ap-ai-hackathon.cluster-cqt08oi8i1b6.us-east-1.rds.amazonaws.com:5432/postgres?sslmode=require" +DATABASE_URL = os.environ.get( + "DATABASE_URL", + "postgresql://your_user:your_password@your_aurora_endpoint/myappdb" +).replace("postgresql://", "postgresql+psycopg://") # The 'connect_args' is needed for SQLite, but not for PostgreSQL. # For PostgreSQL, you can remove it. diff --git a/backend/web_api/web_api.py b/backend/web_api/web_api.py index eb7bacd..3e0619e 100644 --- a/backend/web_api/web_api.py +++ b/backend/web_api/web_api.py @@ -14,7 +14,8 @@ # from fastapi import FastAPI, HTTPException, Depends from pydantic import BaseModel, Field -from psycopg2 import connect, ProgrammingError +from psycopg import connect, ProgrammingError +from contextlib import closing from mangum import Mangum # # Import our new Bedrock agent function @@ -115,14 +116,11 @@ class NaturalLanguageQuery(BaseModel): # --- Database Connection --- # IMPORTANT: Use a read-only user for the database connection. -DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://your_user:your_password@your_aurora_endpoint/myappdb") +DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://your_user:your_password@your_aurora_endpoint/myappdb").replace("postgresql://", "postgresql+psycopg://") def get_db_connection(): - conn = connect(DATABASE_URL) - try: + with connect(DATABASE_URL) as conn: yield conn - finally: - conn.close() app = FastAPI( title="FastAPI with Bedrock Agents" @@ -161,7 +159,7 @@ def query_with_bedrock_agent(query: NaturalLanguageQuery, conn=Depends(get_db_co # 3. Execute the SQL from the agent try: - with closing(conn.cursor()) as cur: + with conn.cursor() as cur: cur.execute(generated_sql) if cur.description is None: return [] From f2f87327d689d551c1d4a772b7f3ae04645036df Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 19:23:55 +0530 Subject: [PATCH 062/114] parsing fix parsing fix --- backend/Submit/Submit.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 9b63556..6d877fc 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -91,5 +91,10 @@ def extract_articles(file_stream): date = date_parts[0].strip() content = date_parts[1].strip() if len(date_parts) > 1 else "" print(f"Extracted article - Title: {title}, Source: {source}, Date: {date}, Content length: {len(content)}") - articles.append([title, source, date, content]) + articles.append({ + "Title": title, + "Source": source, + "Date": date, + "Content": content + }) return articles \ No newline at end of file From c3453f844f40489949cc8323fbcb8cfabb777bfc Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sat, 28 Jun 2025 19:24:30 +0530 Subject: [PATCH 063/114] Another fix --- backend/web_api/web_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/web_api/web_api.py b/backend/web_api/web_api.py index 3e0619e..d3ac649 100644 --- a/backend/web_api/web_api.py +++ b/backend/web_api/web_api.py @@ -14,7 +14,7 @@ # from fastapi import FastAPI, HTTPException, Depends from pydantic import BaseModel, Field -from psycopg import connect, ProgrammingError +from psycopg2 import connect, ProgrammingError from contextlib import closing from mangum import Mangum From e70a542ddde25448cfe519545f40272fa6b51c13 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 19:29:16 +0530 Subject: [PATCH 064/114] removed redanduct print removed redanduct print --- backend/Submit/Submit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 6d877fc..9ee4e37 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -41,7 +41,7 @@ def lambda_handler(event, context): articles = extract_articles(io.BytesIO(part.content)) print(f"Extracted {len(articles)} articles from part") for article in articles: - print(f"Processing article: {article[0]}") + # print(f"Processing article: {article[0]}") output_csv = io.StringIO() writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) writer.writeheader() From 59bf78fcb1e85d9dc58d7bdb1434506595ebb9cc Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 19:32:22 +0530 Subject: [PATCH 065/114] fix fix --- backend/Submit/Submit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 9ee4e37..5f231a5 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -41,7 +41,6 @@ def lambda_handler(event, context): articles = extract_articles(io.BytesIO(part.content)) print(f"Extracted {len(articles)} articles from part") for article in articles: - # print(f"Processing article: {article[0]}") output_csv = io.StringIO() writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) writer.writeheader() From fb785f52afe5b8e0350e5bc091ed47caf0a724ad Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 19:34:17 +0530 Subject: [PATCH 066/114] fix2 fix2 --- backend/Submit/Submit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 5f231a5..1866af5 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -41,6 +41,7 @@ def lambda_handler(event, context): articles = extract_articles(io.BytesIO(part.content)) print(f"Extracted {len(articles)} articles from part") for article in articles: + print(f"Processing article: {article['Title']}") output_csv = io.StringIO() writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) writer.writeheader() @@ -50,7 +51,7 @@ def lambda_handler(event, context): csv_filename = f"input/articles-{article_id}.csv" cursor.execute(""" INSERT INTO articles (article_id, title, body, source, published_date) - VALUES (%s, %s, %s, %s, %s)""", (article_id, article[0], article[1], article[2], article[3])) + VALUES (%s, %s, %s, %s, %s)""", (article_id, article['Title'], article['Content'], article['Source'], article['Date'])) # Upload to S3 print(f"Uploading CSV to S3: {csv_filename}") s3.put_object( From b563ff9c98184605db82bf2db4e227672ad6bdb2 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sat, 28 Jun 2025 19:34:49 +0530 Subject: [PATCH 067/114] Another fixv2 --- backend/web_api/web_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/web_api/web_api.py b/backend/web_api/web_api.py index d3ac649..c0bb765 100644 --- a/backend/web_api/web_api.py +++ b/backend/web_api/web_api.py @@ -116,7 +116,7 @@ class NaturalLanguageQuery(BaseModel): # --- Database Connection --- # IMPORTANT: Use a read-only user for the database connection. -DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://your_user:your_password@your_aurora_endpoint/myappdb").replace("postgresql://", "postgresql+psycopg://") +DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://your_user:your_password@your_aurora_endpoint/myappdb") def get_db_connection(): with connect(DATABASE_URL) as conn: From 701176185c0b9d9e569a0ef5eadf994b7302e63e Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 19:35:34 +0530 Subject: [PATCH 068/114] updated response data updated response data --- backend/Submit/Submit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 1866af5..11a6b4d 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -66,7 +66,7 @@ def lambda_handler(event, context): s3_urls.append(s3_url) cursor.close() conn.close() - return {"statusCode": 200, "status": "success", "count": len(articles), "data": articles, "s3_urls": s3_urls} + return {"statusCode": 200, "status": "success", "s3_urls": s3_urls} except Exception as e: traceback.print_exc() return {"statusCode": 500, "body": f"āŒ Error: {str(e)}"} From 4d836cd7e921aaa206f4b0ca825f1c69b97a481f Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 19:47:26 +0530 Subject: [PATCH 069/114] response fix response fix --- backend/Submit/Submit.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 11a6b4d..a497ce7 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -1,4 +1,5 @@ import base64 +import json from requests_toolbelt.multipart import decoder import uuid from Utils import get_postgresql_connection @@ -66,7 +67,16 @@ def lambda_handler(event, context): s3_urls.append(s3_url) cursor.close() conn.close() - return {"statusCode": 200, "status": "success", "s3_urls": s3_urls} + return { + "statusCode": 200, + "headers": { + "Content-Type": "application/json" + }, + "body": json.dumps({ + "status": "success", + "s3_urls": s3_urls + }) + } except Exception as e: traceback.print_exc() return {"statusCode": 500, "body": f"āŒ Error: {str(e)}"} From d1d0d327af6f29d34335a56301ff40539c4b50ef Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 20:05:58 +0530 Subject: [PATCH 070/114] parser optimization parser optimization --- backend/Submit/Submit copy.py | 122 ++++++++++++++++++ backend/Submit/Submit.py | 71 ++--------- backend/raw_data_handler/raw_data_handler.py | 126 +++++++++++++++++++ 3 files changed, 261 insertions(+), 58 deletions(-) create mode 100644 backend/Submit/Submit copy.py create mode 100644 backend/raw_data_handler/raw_data_handler.py diff --git a/backend/Submit/Submit copy.py b/backend/Submit/Submit copy.py new file mode 100644 index 0000000..95bf690 --- /dev/null +++ b/backend/Submit/Submit copy.py @@ -0,0 +1,122 @@ +import base64 +import json +from requests_toolbelt.multipart import decoder +import uuid +from Utils import get_postgresql_connection +from fastapi import FastAPI +from docx import Document +import csv +import io +import re +import boto3 +import traceback + +app = FastAPI() + +s3 = boto3.client('s3') +BUCKET_NAME = 'awstraindata' + +def lambda_handler(event, context): + try: + # Decode base64-encoded body (API Gateway encodes binary automatically) + print(f"Received event") + if event.get("isBase64Encoded", False): + body = base64.b64decode(event['body']) + else: + body = event['body'].encode("utf-8") + print(f"Decoded body length: {len(body)} bytes") + # Get content-type header + content_type = event['headers'].get('Content-Type') or event['headers'].get('content-type') + if not content_type: + return {"statusCode": 400, "body": "Missing Content-Type header"} + + # Parse multipart form + multipart_data = decoder.MultipartDecoder(body, content_type) + print(f"Multipart data parts: {len(multipart_data.parts)}") + s3_urls = [] + conn = get_postgresql_connection() + cursor = conn.cursor() + for part in multipart_data.parts: + print(f"Processing part: {part.headers.get(b'Content-Disposition')}") + filename = part.headers.get(b'Content-Disposition').decode().split('filename="')[1].split('"')[0] + file_stream = io.BytesIO(part.content) + file_stream.seek(0) + file_id = str(uuid.uuid4()) + s3_key = f"raw_data/{file_id}-{filename}" + # Upload to S3 + s3.put_object( + Bucket=BUCKET_NAME, + Key=s3_key, + Body=file_stream, + ContentType='application/vnd.openxmlformats-officedocument.wordprocessingml.document' + ) + # Extract file name from content-disposition + articles = extract_articles(io.BytesIO(part.content)) + print(f"Extracted {len(articles)} articles from part") + for article in articles: + print(f"Processing article: {article['Title']}") + output_csv = io.StringIO() + writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) + writer.writeheader() + writer.writerow(article) + article_id = str(uuid.uuid4()) + # Generate unique filename + csv_filename = f"input/articles-{article_id}.csv" + cursor.execute(""" + INSERT INTO articles (article_id, title, body, source, published_date) + VALUES (%s, %s, %s, %s, %s)""", (article_id, article['Title'], article['Content'], article['Source'], article['Date'])) + # Upload to S3 + print(f"Uploading CSV to S3: {csv_filename}") + s3.put_object( + Bucket=BUCKET_NAME, + Key=csv_filename, + Body=output_csv.getvalue(), + ContentType='text/csv' + ) + conn.commit() + print(f"Uploaded CSV to S3: {csv_filename}") + s3_url = f"https://{BUCKET_NAME}.s3.amazonaws.com/{csv_filename}" + s3_urls.append(s3_url) + cursor.close() + conn.close() + return { + "statusCode": 200, + "headers": { + "Content-Type": "application/json" + }, + "body": json.dumps({ + "status": "success", + "s3_urls": s3_urls + }) + } + except Exception as e: + traceback.print_exc() + return {"statusCode": 500, "body": f"āŒ Error: {str(e)}"} + +def extract_articles(file_stream): + print(f"Extracting articles from file stream") + doc = Document(file_stream) + print(f"Document loaded with {len(doc.paragraphs)} paragraphs") + text = "\n".join(p.text for p in doc.paragraphs) + pattern = re.compile( + r'Title:\s*(.*?)\s*Source:\s*(.*?)\s*Date:\s*(.*?)\s*(?=(?:\d{1,2}\)|Title:)|\Z)', + re.DOTALL + ) + matches = pattern.findall(text) + print(f"Found {len(matches)} matches in the document") + articles = [] + for match in matches: + print(f"Processing match: {match}") + title = match[0].strip() + source = match[1].strip() + date_parts = match[2].strip().split("\n", 1) + date = date_parts[0].strip() + content = date_parts[1].strip() if len(date_parts) > 1 else "" + print(f"Extracted article - Title: {title}, Source: {source}, Date: {date}, Content length: {len(content)}") + articles.append({ + "Title": title, + "Source": source, + "Date": date, + "Content": content + }) + return articles \ No newline at end of file diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index a497ce7..72c629b 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -38,35 +38,18 @@ def lambda_handler(event, context): cursor = conn.cursor() for part in multipart_data.parts: print(f"Processing part: {part.headers.get(b'Content-Disposition')}") - # Extract file name from content-disposition - articles = extract_articles(io.BytesIO(part.content)) - print(f"Extracted {len(articles)} articles from part") - for article in articles: - print(f"Processing article: {article['Title']}") - output_csv = io.StringIO() - writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) - writer.writeheader() - writer.writerow(article) - article_id = str(uuid.uuid4()) - # Generate unique filename - csv_filename = f"input/articles-{article_id}.csv" - cursor.execute(""" - INSERT INTO articles (article_id, title, body, source, published_date) - VALUES (%s, %s, %s, %s, %s)""", (article_id, article['Title'], article['Content'], article['Source'], article['Date'])) - # Upload to S3 - print(f"Uploading CSV to S3: {csv_filename}") - s3.put_object( - Bucket=BUCKET_NAME, - Key=csv_filename, - Body=output_csv.getvalue(), - ContentType='text/csv' - ) - conn.commit() - print(f"Uploaded CSV to S3: {csv_filename}") - s3_url = f"https://{BUCKET_NAME}.s3.amazonaws.com/{csv_filename}" - s3_urls.append(s3_url) - cursor.close() - conn.close() + filename = part.headers.get(b'Content-Disposition').decode().split('filename="')[1].split('"')[0] + file_stream = io.BytesIO(part.content) + file_stream.seek(0) + file_id = str(uuid.uuid4()) + s3_key = f"raw_data/{file_id}-{filename}" + # Upload to S3 + s3.put_object( + Bucket=BUCKET_NAME, + Key=s3_key, + Body=file_stream, + ContentType='application/vnd.openxmlformats-officedocument.wordprocessingml.document' + ) return { "statusCode": 200, "headers": { @@ -79,32 +62,4 @@ def lambda_handler(event, context): } except Exception as e: traceback.print_exc() - return {"statusCode": 500, "body": f"āŒ Error: {str(e)}"} - -def extract_articles(file_stream): - print(f"Extracting articles from file stream") - doc = Document(file_stream) - print(f"Document loaded with {len(doc.paragraphs)} paragraphs") - text = "\n".join(p.text for p in doc.paragraphs) - pattern = re.compile( - r'Title:\s*(.*?)\s*Source:\s*(.*?)\s*Date:\s*(.*?)\s*(?=(?:\d{1,2}\)|Title:)|\Z)', - re.DOTALL - ) - matches = pattern.findall(text) - print(f"Found {len(matches)} matches in the document") - articles = [] - for match in matches: - print(f"Processing match: {match}") - title = match[0].strip() - source = match[1].strip() - date_parts = match[2].strip().split("\n", 1) - date = date_parts[0].strip() - content = date_parts[1].strip() if len(date_parts) > 1 else "" - print(f"Extracted article - Title: {title}, Source: {source}, Date: {date}, Content length: {len(content)}") - articles.append({ - "Title": title, - "Source": source, - "Date": date, - "Content": content - }) - return articles \ No newline at end of file + return {"statusCode": 500, "body": f"āŒ Error: {str(e)}"} \ No newline at end of file diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py new file mode 100644 index 0000000..5558550 --- /dev/null +++ b/backend/raw_data_handler/raw_data_handler.py @@ -0,0 +1,126 @@ +import base64 +import json +import time +import uuid +from Utils import get_postgresql_connection +from fastapi import FastAPI +from docx import Document +import csv +import io +import re +import boto3 +import traceback + +BUCKET_NAME = 'awstraindata' +role = 'arn:aws:iam::269854564686:role/hackathon-comprehend-role' +def lambda_handler(event, context): + try: + conn = get_postgresql_connection() + cursor = conn.cursor() + for record in event['Records']: + print(f"New record: {record}") + bucket = record['s3']['bucket']['name'] + key = record['s3']['object']['key'] + print(f"Processing file from bucket: {bucket}, key: {key}") + s3 = boto3.client('s3') + print(f"Connecting to S3 bucket: {bucket}") + obj = s3.get_object(Bucket=bucket, Key=key) + stream = io.BytesIO(obj['Body'].read()) + articles = extract_articles(stream) + print(f"Extracted {len(articles)} articles from part") + for article in articles: + print(f"Processing article: {article['Title']}") + output_csv = io.StringIO() + writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) + writer.writeheader() + writer.writerow(article) + article_id = str(uuid.uuid4()) + # Generate unique filename + csv_filename = f"input/articles-{article_id}.csv" + cursor.execute(""" + INSERT INTO articles (article_id, title, body, source, published_date) + VALUES (%s, %s, %s, %s, %s)""", (article_id, article['Title'], article['Content'], article['Source'], article['Date'])) + # Upload to S3 + print(f"Uploading CSV to S3: {csv_filename}") + s3.put_object( + Bucket=BUCKET_NAME, + Key=csv_filename, + Body=output_csv.getvalue(), + ContentType='text/csv' + ) + conn.commit() + start_jobs(f's3://{BUCKET_NAME}/{csv_filename}', article_id, boto3.client('comprehend'), role, cursor, conn) + cursor.close() + conn.close() + except Exception as e: + traceback.print_exc() + print(f"Error processing event: {e}") + +def start_jobs(s3_uri, articles_id, comprehend, role_arn, cursor, conn): + entities_job = comprehend.start_entities_detection_job( + InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, + OutputDataConfig={'S3Uri': 's3://awstraindata/output/entities/'}, + DataAccessRoleArn=role_arn, + LanguageCode='en', + JobName='MyEntityDetectionJob_'+ articles_id + '_' + str(int(time.time())) + ) + result = comprehend.describe_entities_detection_job(JobId=entities_job['JobId']) + entities_output = result['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri'] + + # SENTIMENT detection job + sentiment_job = comprehend.start_sentiment_detection_job( + InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, + OutputDataConfig={'S3Uri': 's3://awstraindata/output/sentiment/'}, + DataAccessRoleArn=role_arn, + LanguageCode='en', + JobName='MySentimentDetectionJob_' + articles_id + '_' + str(int(time.time())) + ) + res = comprehend.describe_sentiment_detection_job(JobId=sentiment_job['JobId']) + sentiment_output = res['SentimentDetectionJobProperties']['OutputDataConfig']['S3Uri'] + + # KEY PHRASES detection job + phrases_job = comprehend.start_key_phrases_detection_job( + InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, + OutputDataConfig={'S3Uri': 's3://awstraindata/output/keyphrases/'}, + DataAccessRoleArn=role_arn, + LanguageCode='en', + JobName='MyKeyPhrasesDetectionJob_' + articles_id + '_' + str(int(time.time())) + ) + res = comprehend.describe_key_phrases_detection_job(JobId=phrases_job['JobId']) + key_phrases_output = res['KeyPhrasesDetectionJobProperties']['OutputDataConfig']['S3Uri'] + print("Entities Job Response:", entities_output) + print("Sentiment Job Response:", sentiment_output) + print("Key Phrases Job Response:", key_phrases_output) + print("Inserting into comprehend_jobs table") + cursor.execute(""" + INSERT INTO comprehend_jobs (article_id, input_s3_uri, entities_path, sentiment_path, key_phrases_path) + VALUES (%s, %s, %s, %s, %s)""", (articles_id, s3_uri, entities_output.replace('s3://awstraindata/', ''), sentiment_output.replace('s3://awstraindata/', ''), key_phrases_output.replace('s3://awstraindata/', ''))) + conn.commit() + +def extract_articles(file_stream): + print(f"Extracting articles from file stream") + doc = Document(file_stream) + print(f"Document loaded with {len(doc.paragraphs)} paragraphs") + text = "\n".join(p.text for p in doc.paragraphs) + pattern = re.compile( + r'Title:\s*(.*?)\s*Source:\s*(.*?)\s*Date:\s*(.*?)\s*(?=(?:\d{1,2}\)|Title:)|\Z)', + re.DOTALL + ) + matches = pattern.findall(text) + print(f"Found {len(matches)} matches in the document") + articles = [] + for match in matches: + print(f"Processing match: {match}") + title = match[0].strip() + source = match[1].strip() + date_parts = match[2].strip().split("\n", 1) + date = date_parts[0].strip() + content = date_parts[1].strip() if len(date_parts) > 1 else "" + print(f"Extracted article - Title: {title}, Source: {source}, Date: {date}, Content length: {len(content)}") + articles.append({ + "Title": title, + "Source": source, + "Date": date, + "Content": content + }) + return articles From 6aa318642121a02f5ff2442fb637d13f2ee012d6 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 20:15:07 +0530 Subject: [PATCH 071/114] file space issue file space issue --- backend/Submit/Submit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 72c629b..b6e704b 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -42,7 +42,7 @@ def lambda_handler(event, context): file_stream = io.BytesIO(part.content) file_stream.seek(0) file_id = str(uuid.uuid4()) - s3_key = f"raw_data/{file_id}-{filename}" + s3_key = f"raw_data/{file_id}-{filename.replace(' ', '_').replace('/', '_')}" # Upload to S3 s3.put_object( Bucket=BUCKET_NAME, From 15a20c6dd4c3545a10336e8fd0679fd14710c48d Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 20:21:34 +0530 Subject: [PATCH 072/114] added region added region --- backend/raw_data_handler/raw_data_handler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index 5558550..7fe4f18 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -17,6 +17,7 @@ def lambda_handler(event, context): try: conn = get_postgresql_connection() cursor = conn.cursor() + comprehend = boto3.client('comprehend', region_name='us-east-1') for record in event['Records']: print(f"New record: {record}") bucket = record['s3']['bucket']['name'] @@ -49,7 +50,7 @@ def lambda_handler(event, context): ContentType='text/csv' ) conn.commit() - start_jobs(f's3://{BUCKET_NAME}/{csv_filename}', article_id, boto3.client('comprehend'), role, cursor, conn) + start_jobs(f's3://{BUCKET_NAME}/{csv_filename}', article_id, comprehend, role, cursor, conn) cursor.close() conn.close() except Exception as e: @@ -57,6 +58,8 @@ def lambda_handler(event, context): print(f"Error processing event: {e}") def start_jobs(s3_uri, articles_id, comprehend, role_arn, cursor, conn): + print(f"Starting jobs for article ID: {articles_id}") + entities_job = comprehend.start_entities_detection_job( InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, OutputDataConfig={'S3Uri': 's3://awstraindata/output/entities/'}, From 19c0bcbb89e686fd1cdb16428d15a5568908dcdd Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 20:23:13 +0530 Subject: [PATCH 073/114] improved logging improved logging --- backend/raw_data_handler/raw_data_handler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index 7fe4f18..8e63f57 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -59,7 +59,6 @@ def lambda_handler(event, context): def start_jobs(s3_uri, articles_id, comprehend, role_arn, cursor, conn): print(f"Starting jobs for article ID: {articles_id}") - entities_job = comprehend.start_entities_detection_job( InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, OutputDataConfig={'S3Uri': 's3://awstraindata/output/entities/'}, @@ -67,7 +66,9 @@ def start_jobs(s3_uri, articles_id, comprehend, role_arn, cursor, conn): LanguageCode='en', JobName='MyEntityDetectionJob_'+ articles_id + '_' + str(int(time.time())) ) + print(f"Entities job started: {entities_job['JobId']}") result = comprehend.describe_entities_detection_job(JobId=entities_job['JobId']) + print(f"Entities job description: {result}") entities_output = result['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri'] # SENTIMENT detection job @@ -78,7 +79,9 @@ def start_jobs(s3_uri, articles_id, comprehend, role_arn, cursor, conn): LanguageCode='en', JobName='MySentimentDetectionJob_' + articles_id + '_' + str(int(time.time())) ) + print(f"Sentiment job started: {sentiment_job['JobId']}") res = comprehend.describe_sentiment_detection_job(JobId=sentiment_job['JobId']) + print(f"Sentiment job description: {res}") sentiment_output = res['SentimentDetectionJobProperties']['OutputDataConfig']['S3Uri'] # KEY PHRASES detection job @@ -89,7 +92,9 @@ def start_jobs(s3_uri, articles_id, comprehend, role_arn, cursor, conn): LanguageCode='en', JobName='MyKeyPhrasesDetectionJob_' + articles_id + '_' + str(int(time.time())) ) + print(f"Key Phrases job started: {phrases_job['JobId']}") res = comprehend.describe_key_phrases_detection_job(JobId=phrases_job['JobId']) + print(f"Key Phrases job description: {res}") key_phrases_output = res['KeyPhrasesDetectionJobProperties']['OutputDataConfig']['S3Uri'] print("Entities Job Response:", entities_output) print("Sentiment Job Response:", sentiment_output) From 04d24a80f4a2585f199bfa4e7cd1ff0969398a68 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 21:26:24 +0530 Subject: [PATCH 074/114] file name changes file name changes --- backend/Submit/Submit.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index b6e704b..a0cfc47 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -38,11 +38,10 @@ def lambda_handler(event, context): cursor = conn.cursor() for part in multipart_data.parts: print(f"Processing part: {part.headers.get(b'Content-Disposition')}") - filename = part.headers.get(b'Content-Disposition').decode().split('filename="')[1].split('"')[0] file_stream = io.BytesIO(part.content) file_stream.seek(0) file_id = str(uuid.uuid4()) - s3_key = f"raw_data/{file_id}-{filename.replace(' ', '_').replace('/', '_')}" + s3_key = f"raw_data/{file_id}" # Upload to S3 s3.put_object( Bucket=BUCKET_NAME, From 4c9a2612cfd6713728b4044462ffffe20ef65536 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 21:34:30 +0530 Subject: [PATCH 075/114] remove headers remove headers --- backend/raw_data_handler/raw_data_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index 8e63f57..feb2b66 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -33,7 +33,7 @@ def lambda_handler(event, context): print(f"Processing article: {article['Title']}") output_csv = io.StringIO() writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) - writer.writeheader() + # writer.writeheader() writer.writerow(article) article_id = str(uuid.uuid4()) # Generate unique filename From 1eef5808df68f91c423b41be23c924d0c3c0025a Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 22:06:09 +0530 Subject: [PATCH 076/114] endpoint test endpoint test --- .../raw_data_handler/raw_data_handler copy.py | 198 ++++++++++++++++++ backend/raw_data_handler/raw_data_handler.py | 133 +++++++----- 2 files changed, 280 insertions(+), 51 deletions(-) create mode 100644 backend/raw_data_handler/raw_data_handler copy.py diff --git a/backend/raw_data_handler/raw_data_handler copy.py b/backend/raw_data_handler/raw_data_handler copy.py new file mode 100644 index 0000000..e235d6f --- /dev/null +++ b/backend/raw_data_handler/raw_data_handler copy.py @@ -0,0 +1,198 @@ +import base64 +import json +import time +import uuid +from Utils import get_postgresql_connection +from fastapi import FastAPI +from docx import Document +import csv +import io +import re +import boto3 +import traceback + +BUCKET_NAME = 'awstraindata' +role = 'arn:aws:iam::269854564686:role/hackathon-comprehend-role' +def lambda_handler(event, context): + try: + conn = get_postgresql_connection() + cursor = conn.cursor() + comprehend = boto3.client('comprehend', region_name='us-east-1') + for record in event['Records']: + print(f"New record: {record}") + bucket = record['s3']['bucket']['name'] + key = record['s3']['object']['key'] + print(f"Processing file from bucket: {bucket}, key: {key}") + s3 = boto3.client('s3') + print(f"Connecting to S3 bucket: {bucket}") + obj = s3.get_object(Bucket=bucket, Key=key) + stream = io.BytesIO(obj['Body'].read()) + articles = extract_articles(stream) + print(f"Extracted {len(articles)} articles from part") + for article in articles: + print(f"Processing article: {article['Title']}") + output_csv = io.StringIO() + writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) + # writer.writeheader() + writer.writerow(article) + article_id = str(uuid.uuid4()) + # Generate unique filename + csv_filename = f"input/articles-{article_id}.csv" + cursor.execute(""" + INSERT INTO articles (article_id, title, body, source, published_date) + VALUES (%s, %s, %s, %s, %s)""", (article_id, article['Title'], article['Content'], article['Source'], article['Date'])) + # Upload to S3 + print(f"Uploading CSV to S3: {csv_filename}") + s3.put_object( + Bucket=BUCKET_NAME, + Key=csv_filename, + Body=output_csv.getvalue(), + ContentType='text/csv' + ) + conn.commit() + start_jobs(f's3://{BUCKET_NAME}/{csv_filename}', article_id, comprehend, role, cursor, conn) + cursor.close() + conn.close() + except Exception as e: + traceback.print_exc() + print(f"Error processing event: {e}") +def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): + entities_response = comprehend.detect_entities( + Text=data['Content'], + DataAccessRoleArn=role_arn, + LanguageCode='en' + ) + add_entities_to_article(conn, cursor, articles_id, entities_response['Entities']) + for entity in entities_response['Entities']: + print(f"Found entity: {entity['Text']} (Type: {entity['Type']})") + +def start_jobs(s3_uri, articles_id, comprehend, role_arn, cursor, conn): + print(f"Starting jobs for article ID: {articles_id}") + entities_job = comprehend.start_entities_detection_job( + InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, + OutputDataConfig={'S3Uri': 's3://awstraindata/output/entities/'}, + DataAccessRoleArn=role_arn, + LanguageCode='en', + JobName='MyEntityDetectionJob_'+ articles_id + '_' + str(int(time.time())) + ) + print(f"Entities job started: {entities_job['JobId']}") + result = comprehend.describe_entities_detection_job(JobId=entities_job['JobId']) + print(f"Entities job description: {result}") + entities_output = result['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri'] + + # SENTIMENT detection job + sentiment_job = comprehend.start_sentiment_detection_job( + InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, + OutputDataConfig={'S3Uri': 's3://awstraindata/output/sentiment/'}, + DataAccessRoleArn=role_arn, + LanguageCode='en', + JobName='MySentimentDetectionJob_' + articles_id + '_' + str(int(time.time())) + ) + print(f"Sentiment job started: {sentiment_job['JobId']}") + res = comprehend.describe_sentiment_detection_job(JobId=sentiment_job['JobId']) + print(f"Sentiment job description: {res}") + sentiment_output = res['SentimentDetectionJobProperties']['OutputDataConfig']['S3Uri'] + + # KEY PHRASES detection job + phrases_job = comprehend.start_key_phrases_detection_job( + InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, + OutputDataConfig={'S3Uri': 's3://awstraindata/output/keyphrases/'}, + DataAccessRoleArn=role_arn, + LanguageCode='en', + JobName='MyKeyPhrasesDetectionJob_' + articles_id + '_' + str(int(time.time())) + ) + print(f"Key Phrases job started: {phrases_job['JobId']}") + res = comprehend.describe_key_phrases_detection_job(JobId=phrases_job['JobId']) + print(f"Key Phrases job description: {res}") + key_phrases_output = res['KeyPhrasesDetectionJobProperties']['OutputDataConfig']['S3Uri'] + print("Entities Job Response:", entities_output) + print("Sentiment Job Response:", sentiment_output) + print("Key Phrases Job Response:", key_phrases_output) + print("Inserting into comprehend_jobs table") + cursor.execute(""" + INSERT INTO comprehend_jobs (article_id, input_s3_uri, entities_path, sentiment_path, key_phrases_path) + VALUES (%s, %s, %s, %s, %s)""", (articles_id, s3_uri, entities_output.replace('s3://awstraindata/', ''), sentiment_output.replace('s3://awstraindata/', ''), key_phrases_output.replace('s3://awstraindata/', ''))) + conn.commit() + +def extract_articles(file_stream): + print(f"Extracting articles from file stream") + doc = Document(file_stream) + print(f"Document loaded with {len(doc.paragraphs)} paragraphs") + text = "\n".join(p.text for p in doc.paragraphs) + pattern = re.compile( + r'Title:\s*(.*?)\s*Source:\s*(.*?)\s*Date:\s*(.*?)\s*(?=(?:\d{1,2}\)|Title:)|\Z)', + re.DOTALL + ) + matches = pattern.findall(text) + print(f"Found {len(matches)} matches in the document") + articles = [] + for match in matches: + print(f"Processing match: {match}") + title = match[0].strip() + source = match[1].strip() + date_parts = match[2].strip().split("\n", 1) + date = date_parts[0].strip() + content = date_parts[1].strip() if len(date_parts) > 1 else "" + print(f"Extracted article - Title: {title}, Source: {source}, Date: {date}, Content length: {len(content)}") + articles.append({ + "Title": title, + "Source": source, + "Date": date, + "Content": content + }) + return articles + + +def add_entities_to_article(conn, cursor, article_id, entities): + entities_text = [entity['Text'] for entity in entities] + print(f"Entities to be added: {entities_text}") + cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) + entity_db_array = cursor.fetchall() + print(f"Entities in DB: {entity_db_array}") + location_mentions = [] + officials_involved = [] + relevance_category = [] + print(f"article_id: {article_id}") + + print(f"Relevance category: {relevance_category}") + for entity in entities: + print(f"Processing entity: {entity}") + entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] + print(f"Entity in DB: {entity_in_db}") + if not entity_in_db: + current_time = datetime.datetime.utcnow() + cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) + conn.commit() + db_entity = cursor.fetchone() + print(f"Inserted new entity: {db_entity}") + if entity['Type'] == 'LOCATION': + location_mentions.append(db_entity[0]) + elif entity['Type'] == 'PERSON': + officials_involved.append(db_entity[0]) + else: + relevance_category.append(db_entity[0]) + else: + print(f"Entity already exists in DB: {entity_in_db}") + if entity['Type'] == 'LOCATION': + location_mentions.append(entity_in_db[0][0]) + elif entity['Type'] == 'PERSON': + officials_involved.append(entity_in_db[0][0]) + else: + relevance_category.append(entity_in_db[0][0]) + if location_mentions: + location_mentions = ','.join(map(str, location_mentions)) + cursor.execute("""update articles set location_mentions = %s where article_id = %s""", (location_mentions, article_id)) + + if officials_involved: + officials_involved = ','.join(map(str, officials_involved)) + cursor.execute("""update articles set officials_involved = %s where article_id = %s""", (officials_involved, article_id)) + + if relevance_category: + cursor.execute("SELECT relevance_category FROM articles WHERE article_id = %s", (article_id,)) + existing = cursor.fetchone() + relevance_category = ','.join(map(str, relevance_category)) + if existing[0] is not None: + print(f"Existing relevance category: {existing[0]}") + relevance_category = relevance_category + ',' + existing[0] + cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) + cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) \ No newline at end of file diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index feb2b66..942947c 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -43,67 +43,43 @@ def lambda_handler(event, context): VALUES (%s, %s, %s, %s, %s)""", (article_id, article['Title'], article['Content'], article['Source'], article['Date'])) # Upload to S3 print(f"Uploading CSV to S3: {csv_filename}") - s3.put_object( - Bucket=BUCKET_NAME, - Key=csv_filename, - Body=output_csv.getvalue(), - ContentType='text/csv' - ) + # s3.put_object( + # Bucket=BUCKET_NAME, + # Key=csv_filename, + # Body=output_csv.getvalue(), + # ContentType='text/csv' + # ) conn.commit() - start_jobs(f's3://{BUCKET_NAME}/{csv_filename}', article_id, comprehend, role, cursor, conn) + get_data_inline(output_csv.getvalue(), article_id, comprehend, role, cursor, conn) cursor.close() conn.close() except Exception as e: traceback.print_exc() print(f"Error processing event: {e}") -def start_jobs(s3_uri, articles_id, comprehend, role_arn, cursor, conn): - print(f"Starting jobs for article ID: {articles_id}") - entities_job = comprehend.start_entities_detection_job( - InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, - OutputDataConfig={'S3Uri': 's3://awstraindata/output/entities/'}, +def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): + entities_response = comprehend.detect_entities( + Text=data['Content'], DataAccessRoleArn=role_arn, - LanguageCode='en', - JobName='MyEntityDetectionJob_'+ articles_id + '_' + str(int(time.time())) + LanguageCode='en' ) - print(f"Entities job started: {entities_job['JobId']}") - result = comprehend.describe_entities_detection_job(JobId=entities_job['JobId']) - print(f"Entities job description: {result}") - entities_output = result['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri'] - - # SENTIMENT detection job - sentiment_job = comprehend.start_sentiment_detection_job( - InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, - OutputDataConfig={'S3Uri': 's3://awstraindata/output/sentiment/'}, - DataAccessRoleArn=role_arn, - LanguageCode='en', - JobName='MySentimentDetectionJob_' + articles_id + '_' + str(int(time.time())) - ) - print(f"Sentiment job started: {sentiment_job['JobId']}") - res = comprehend.describe_sentiment_detection_job(JobId=sentiment_job['JobId']) - print(f"Sentiment job description: {res}") - sentiment_output = res['SentimentDetectionJobProperties']['OutputDataConfig']['S3Uri'] - - # KEY PHRASES detection job - phrases_job = comprehend.start_key_phrases_detection_job( - InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, - OutputDataConfig={'S3Uri': 's3://awstraindata/output/keyphrases/'}, - DataAccessRoleArn=role_arn, - LanguageCode='en', - JobName='MyKeyPhrasesDetectionJob_' + articles_id + '_' + str(int(time.time())) + add_entities_to_article(conn, cursor, articles_id, entities_response['Entities']) + response = comprehend.detect_key_phrases( + Text=data['Content'], + DataAccessRoleArn=role_arn, + LanguageCode='en' + ) + for keyPhrase in response['KeyPhrases']: + keyPhrase['Type'] = 'KeyPhrase' + add_entities_to_article(conn, cursor, articles_id, response['KeyPhrases']) + sentiment_response = comprehend.detect_sentiment( + Text=data['Content'], + DataAccessRoleArn=role_arn, + LanguageCode='en' ) - print(f"Key Phrases job started: {phrases_job['JobId']}") - res = comprehend.describe_key_phrases_detection_job(JobId=phrases_job['JobId']) - print(f"Key Phrases job description: {res}") - key_phrases_output = res['KeyPhrasesDetectionJobProperties']['OutputDataConfig']['S3Uri'] - print("Entities Job Response:", entities_output) - print("Sentiment Job Response:", sentiment_output) - print("Key Phrases Job Response:", key_phrases_output) - print("Inserting into comprehend_jobs table") - cursor.execute(""" - INSERT INTO comprehend_jobs (article_id, input_s3_uri, entities_path, sentiment_path, key_phrases_path) - VALUES (%s, %s, %s, %s, %s)""", (articles_id, s3_uri, entities_output.replace('s3://awstraindata/', ''), sentiment_output.replace('s3://awstraindata/', ''), key_phrases_output.replace('s3://awstraindata/', ''))) - conn.commit() + sentiment = sentiment_response['Sentiment'] + if sentiment: + cursor.execute("""update articles set sentiment = %s where article_id = %s""", (sentiment, articles_id)) def extract_articles(file_stream): print(f"Extracting articles from file stream") @@ -132,3 +108,58 @@ def extract_articles(file_stream): "Content": content }) return articles + + +def add_entities_to_article(conn, cursor, article_id, entities): + entities_text = [entity['Text'] for entity in entities] + print(f"Entities to be added: {entities_text}") + cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) + entity_db_array = cursor.fetchall() + print(f"Entities in DB: {entity_db_array}") + location_mentions = [] + officials_involved = [] + relevance_category = [] + print(f"article_id: {article_id}") + + print(f"Relevance category: {relevance_category}") + for entity in entities: + print(f"Processing entity: {entity}") + entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] + print(f"Entity in DB: {entity_in_db}") + if not entity_in_db: + current_time = datetime.datetime.utcnow() + cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) + conn.commit() + db_entity = cursor.fetchone() + print(f"Inserted new entity: {db_entity}") + if entity['Type'] == 'LOCATION': + location_mentions.append(db_entity[0]) + elif entity['Type'] == 'PERSON': + officials_involved.append(db_entity[0]) + else: + relevance_category.append(db_entity[0]) + else: + print(f"Entity already exists in DB: {entity_in_db}") + if entity['Type'] == 'LOCATION': + location_mentions.append(entity_in_db[0][0]) + elif entity['Type'] == 'PERSON': + officials_involved.append(entity_in_db[0][0]) + else: + relevance_category.append(entity_in_db[0][0]) + if location_mentions: + location_mentions = ','.join(map(str, location_mentions)) + cursor.execute("""update articles set location_mentions = %s where article_id = %s""", (location_mentions, article_id)) + + if officials_involved: + officials_involved = ','.join(map(str, officials_involved)) + cursor.execute("""update articles set officials_involved = %s where article_id = %s""", (officials_involved, article_id)) + + if relevance_category: + cursor.execute("SELECT relevance_category FROM articles WHERE article_id = %s", (article_id,)) + existing = cursor.fetchone() + relevance_category = ','.join(map(str, relevance_category)) + if existing[0] is not None: + print(f"Existing relevance category: {existing[0]}") + relevance_category = relevance_category + ',' + existing[0] + cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) + cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) \ No newline at end of file From 1d12c227e50df49fe04243d391b55b10b376d65c Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 22:12:09 +0530 Subject: [PATCH 077/114] error fix error fix --- backend/raw_data_handler/raw_data_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index 942947c..26d15ef 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -59,13 +59,13 @@ def lambda_handler(event, context): def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): entities_response = comprehend.detect_entities( - Text=data['Content'], + Text=data, DataAccessRoleArn=role_arn, LanguageCode='en' ) add_entities_to_article(conn, cursor, articles_id, entities_response['Entities']) response = comprehend.detect_key_phrases( - Text=data['Content'], + Text=data, DataAccessRoleArn=role_arn, LanguageCode='en' ) @@ -73,7 +73,7 @@ def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): keyPhrase['Type'] = 'KeyPhrase' add_entities_to_article(conn, cursor, articles_id, response['KeyPhrases']) sentiment_response = comprehend.detect_sentiment( - Text=data['Content'], + Text=data, DataAccessRoleArn=role_arn, LanguageCode='en' ) From b5803c3a58cfd28bbf0e5b8a6316b695181733d8 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 22:15:57 +0530 Subject: [PATCH 078/114] debug logs --- backend/raw_data_handler/raw_data_handler.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index 26d15ef..fb5a9b0 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -58,17 +58,20 @@ def lambda_handler(event, context): print(f"Error processing event: {e}") def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): + print(f"Processing data for article ID: {articles_id}") entities_response = comprehend.detect_entities( Text=data, DataAccessRoleArn=role_arn, LanguageCode='en' ) + print(f"Entities detected: {entities_response['Entities']}") add_entities_to_article(conn, cursor, articles_id, entities_response['Entities']) response = comprehend.detect_key_phrases( Text=data, DataAccessRoleArn=role_arn, LanguageCode='en' ) + print(f"Key phrases detected: {response['KeyPhrases']}") for keyPhrase in response['KeyPhrases']: keyPhrase['Type'] = 'KeyPhrase' add_entities_to_article(conn, cursor, articles_id, response['KeyPhrases']) @@ -77,6 +80,7 @@ def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): DataAccessRoleArn=role_arn, LanguageCode='en' ) + print(f"Sentiment detected: {sentiment_response['Sentiment']}") sentiment = sentiment_response['Sentiment'] if sentiment: cursor.execute("""update articles set sentiment = %s where article_id = %s""", (sentiment, articles_id)) From f00853dc6b8523ed4ea8ee0e4a81c3c1e45b988c Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 22:19:40 +0530 Subject: [PATCH 079/114] IAM role update --- backend/raw_data_handler/raw_data_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index fb5a9b0..1910207 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -61,14 +61,14 @@ def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): print(f"Processing data for article ID: {articles_id}") entities_response = comprehend.detect_entities( Text=data, - DataAccessRoleArn=role_arn, + # DataAccessRoleArn=role_arn, LanguageCode='en' ) print(f"Entities detected: {entities_response['Entities']}") add_entities_to_article(conn, cursor, articles_id, entities_response['Entities']) response = comprehend.detect_key_phrases( Text=data, - DataAccessRoleArn=role_arn, + # DataAccessRoleArn=role_arn, LanguageCode='en' ) print(f"Key phrases detected: {response['KeyPhrases']}") @@ -77,7 +77,7 @@ def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): add_entities_to_article(conn, cursor, articles_id, response['KeyPhrases']) sentiment_response = comprehend.detect_sentiment( Text=data, - DataAccessRoleArn=role_arn, + # DataAccessRoleArn=role_arn, LanguageCode='en' ) print(f"Sentiment detected: {sentiment_response['Sentiment']}") From c207175d01c2650dd3e52f180cefc66307b80b75 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 22:26:30 +0530 Subject: [PATCH 080/114] datetime fix --- backend/raw_data_handler/raw_data_handler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index 1910207..254d8ba 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -1,4 +1,5 @@ import base64 +import datetime import json import time import uuid From 499d8eaa1d8e3273b7b1579ed221d8b013af7330 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sat, 28 Jun 2025 22:41:26 +0530 Subject: [PATCH 081/114] Pushing changes --- backend/requirements.txt | 5 ++- backend/web_api/database.py | 8 ++-- backend/web_api/models.py | 78 +++++++++++++++++++++++-------------- backend/web_api/web_api.py | 23 ++++++----- 4 files changed, 69 insertions(+), 45 deletions(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 51bd887..8051bbe 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -8,4 +8,7 @@ pydantic python-dotenv requests_toolbelt sqlmodel -psycopg2-binary \ No newline at end of file +# psycopg2-binary +mangum +psycopg[binary] # The modern v3 driver +psycopg-pool # The connection pool for v3 \ No newline at end of file diff --git a/backend/web_api/database.py b/backend/web_api/database.py index df5feb5..9a01499 100644 --- a/backend/web_api/database.py +++ b/backend/web_api/database.py @@ -12,10 +12,10 @@ # For PostgreSQL, you can remove it. engine = create_engine(DATABASE_URL, echo=True) -def create_db_and_tables(): - # This function creates all tables defined by SQLModel models - # that are subclasses of SQLModel. It's good to run this once at startup. - SQLModel.metadata.create_all(engine) +# def create_db_and_tables(): +# # This function creates all tables defined by SQLModel models +# # that are subclasses of SQLModel. It's good to run this once at startup. +# SQLModel.metadata.create_all(engine) # Dependency function to get a database session def get_session(): diff --git a/backend/web_api/models.py b/backend/web_api/models.py index 5f3235d..206906f 100644 --- a/backend/web_api/models.py +++ b/backend/web_api/models.py @@ -1,39 +1,57 @@ # models.py from typing import Optional from sqlmodel import Field, SQLModel +import datetime class Articles(SQLModel, table=True): - articles_id: Optional[str] = Field(default=None, primary_key=True) - title: str = Field(index=True) - body: str - source: str - published_date: Optional[str] = Field(default=None, index=True) - # location_mentions: Optional[str] = Field(default=None, index=True) - # officials_involved: Optional[str] = Field(default=None, index=True) - # relenace_category: Optional[str] = Field(default=None, index=True) - sentiment: Optional[str] = Field(default=None, index=True) - # name: str = Field(index=True) - # secret_name: str - # age: Optional[int] = Field(default=None, index=True) + article_id: str = Field(primary_key=True) + title: Optional[str] = None + body: Optional[str] = None + source: Optional[str] = Field(default=None, alias="source") + published_date: Optional[str] = None + location_mentions: Optional[str] = None + officials_involved: Optional[str] = None + relevance_category: Optional[str] = None + sentiment: Optional[str] = None +class ArticleCreate(SQLModel): + title: Optional[str] = None + body: Optional[str] = None + source: Optional[str] = None + published_date: Optional[str] = None + location_mentions: Optional[str] = None + officials_involved: Optional[str] = None + relevance_category: Optional[str] = None + sentiment: Optional[str] = None -class ArticleCreate(Articles): - pass +class ArticleRead(SQLModel): + article_id: str + title: Optional[str] = None + body: Optional[str] = None + source: Optional[str] = None + published_date: Optional[str] = None + location_mentions: Optional[str] = None + officials_involved: Optional[str] = None + relevance_category: Optional[str] = None + sentiment: Optional[str] = None -class ArticleRead(Articles): - articles_id: str - - class Clusters(SQLModel, table=True): - id: Optional[str] = Field(default=None, primary_key=True) - title: str = Field(index=True) - linkedarticles: Optional[str] = Field(default=None, index=True) - startdate: Optional[str] = Field(default=None, index=True) - enddate: Optional[str] = Field(default=None, index=True) - -class ClusterCreate(Clusters): - pass - -class ClusterRead(Clusters): - id: str - + id: int = Field(default=None, primary_key=True) + title: Optional[datetime.date] = None + linkedarticles: Optional[str] = None + startdate: Optional[str] = None + enddate: Optional[str] = None + +class ClusterCreate(SQLModel): + title: Optional[datetime.date] = None + linkedarticles: Optional[str] = None + startdate: Optional[str] = None + enddate: Optional[str] = None + +class ClusterRead(SQLModel): + id: int + title: Optional[datetime.date] = None + linkedarticles: Optional[str] = None + startdate: Optional[str] = None + enddate: Optional[str] = None + diff --git a/backend/web_api/web_api.py b/backend/web_api/web_api.py index c0bb765..7568cd4 100644 --- a/backend/web_api/web_api.py +++ b/backend/web_api/web_api.py @@ -5,8 +5,8 @@ from fastapi import FastAPI, Depends, HTTPException, Response from sqlmodel import Session, select -from database import get_session, create_db_and_tables -from models import Articles, ArticleCreate, ArticleRead, Clusters, ClusterCreate, ClusterRead +from .database import get_session +from .models import Articles, ArticleCreate, ArticleRead, Clusters, ClusterCreate, ClusterRead from typing import List, Dict, Any from contextlib import asynccontextmanager, closing @@ -14,7 +14,8 @@ # from fastapi import FastAPI, HTTPException, Depends from pydantic import BaseModel, Field -from psycopg2 import connect, ProgrammingError +from psycopg import ProgrammingError +from psycopg_pool import ConnectionPool from contextlib import closing from mangum import Mangum @@ -22,7 +23,7 @@ # from bedrock_agent import generate_sql_from_prompt # Import our new agent invoker function -from bedrock_agent_invoke import invoke_bedrock_agent_to_get_sql +from .bedrock_agent_invoke import invoke_bedrock_agent_to_get_sql # Load environment variables from .env file from dotenv import load_dotenv @@ -33,13 +34,17 @@ AGENT_ID = os.environ.get("BEDROCK_AGENT_ID") AGENT_ALIAS_ID = os.environ.get("BEDROCK_AGENT_ALIAS_ID", "TSTALIASID") # TSTALIASID is a common default -app = FastAPI() +app = FastAPI( + title="FastAPI with Bedrock Agents", + redirect_slashes=True, +) @asynccontextmanager async def lifespan(app: FastAPI): print("Application startup...") yield print("Application shutdown...") + pool.close() # This event handler runs once when the application starts. @@ -118,14 +123,12 @@ class NaturalLanguageQuery(BaseModel): # IMPORTANT: Use a read-only user for the database connection. DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://your_user:your_password@your_aurora_endpoint/myappdb") +pool = ConnectionPool(conninfo=DATABASE_URL) + def get_db_connection(): - with connect(DATABASE_URL) as conn: + with pool.connection() as conn: yield conn -app = FastAPI( - title="FastAPI with Bedrock Agents" -) - @app.post("/query/agent", response_model=List[Dict[str, Any]]) def query_with_bedrock_agent(query: NaturalLanguageQuery, conn=Depends(get_db_connection)): """ From a932cc44b67c515afbda531d8002466ef92dbbee Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 22:49:25 +0530 Subject: [PATCH 082/114] new keyphrase table new keyphrase table --- backend/raw_data_handler/raw_data_handler.py | 64 ++++++++++++++------ 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index 254d8ba..a371b3d 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -75,7 +75,7 @@ def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): print(f"Key phrases detected: {response['KeyPhrases']}") for keyPhrase in response['KeyPhrases']: keyPhrase['Type'] = 'KeyPhrase' - add_entities_to_article(conn, cursor, articles_id, response['KeyPhrases']) + add_keyphrase_to_article(conn, cursor, articles_id, response['KeyPhrases']) sentiment_response = comprehend.detect_sentiment( Text=data, # DataAccessRoleArn=role_arn, @@ -115,6 +115,32 @@ def extract_articles(file_stream): return articles +def add_keyphrase_to_article(conn, cursor, article_id, entities): + entities_text = [entity['Text'] for entity in entities] + print(f"Entities to be added: {entities_text}") + cursor.execute("SELECT * FROM keyphrases WHERE phrases in %s", (tuple(entities_text),)) + entity_db_array = cursor.fetchall() + print(f"Entities in DB: {entity_db_array}") + relevance_category = [] + print(f"article_id: {article_id}") + for entity in entities: + print(f"Processing entity: {entity}") + entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] + print(f"Entity in DB: {entity_in_db}") + if not entity_in_db: + current_time = datetime.datetime.utcnow() + cursor.execute("INSERT INTO keyphrases (create_time,phrases,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) + conn.commit() + db_entity = cursor.fetchone() + print(f"Inserted new entity: {db_entity}") + relevance_category.append(db_entity[0]) + else: + print(f"Entity already exists in DB: {entity_in_db}") + relevance_category.append(entity_in_db[0][0]) + if relevance_category: + relevance_category = ','.join(map(str, relevance_category)) + cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) + def add_entities_to_article(conn, cursor, article_id, entities): entities_text = [entity['Text'] for entity in entities] print(f"Entities to be added: {entities_text}") @@ -123,10 +149,10 @@ def add_entities_to_article(conn, cursor, article_id, entities): print(f"Entities in DB: {entity_db_array}") location_mentions = [] officials_involved = [] - relevance_category = [] + # relevance_category = [] print(f"article_id: {article_id}") - - print(f"Relevance category: {relevance_category}") + + # print(f"Relevance category: {relevance_category}") for entity in entities: print(f"Processing entity: {entity}") entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] @@ -139,18 +165,18 @@ def add_entities_to_article(conn, cursor, article_id, entities): print(f"Inserted new entity: {db_entity}") if entity['Type'] == 'LOCATION': location_mentions.append(db_entity[0]) - elif entity['Type'] == 'PERSON': + elif entity['Type'] == 'PERSON' or entity['Type'] == 'ORGANIZATION': officials_involved.append(db_entity[0]) - else: - relevance_category.append(db_entity[0]) + # else: + # relevance_category.append(db_entity[0]) else: print(f"Entity already exists in DB: {entity_in_db}") if entity['Type'] == 'LOCATION': location_mentions.append(entity_in_db[0][0]) - elif entity['Type'] == 'PERSON': + elif entity['Type'] == 'PERSON' or entity['Type'] == 'ORGANIZATION': officials_involved.append(entity_in_db[0][0]) - else: - relevance_category.append(entity_in_db[0][0]) + # else: + # relevance_category.append(entity_in_db[0][0]) if location_mentions: location_mentions = ','.join(map(str, location_mentions)) cursor.execute("""update articles set location_mentions = %s where article_id = %s""", (location_mentions, article_id)) @@ -159,12 +185,12 @@ def add_entities_to_article(conn, cursor, article_id, entities): officials_involved = ','.join(map(str, officials_involved)) cursor.execute("""update articles set officials_involved = %s where article_id = %s""", (officials_involved, article_id)) - if relevance_category: - cursor.execute("SELECT relevance_category FROM articles WHERE article_id = %s", (article_id,)) - existing = cursor.fetchone() - relevance_category = ','.join(map(str, relevance_category)) - if existing[0] is not None: - print(f"Existing relevance category: {existing[0]}") - relevance_category = relevance_category + ',' + existing[0] - cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) - cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) \ No newline at end of file + # if relevance_category: + # cursor.execute("SELECT relevance_category FROM articles WHERE article_id = %s", (article_id,)) + # existing = cursor.fetchone() + # relevance_category = ','.join(map(str, relevance_category)) + # if existing[0] is not None: + # print(f"Existing relevance category: {existing[0]}") + # relevance_category = relevance_category + ',' + existing[0] + # cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) + # cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) \ No newline at end of file From 0c4dc880ee29a10fa71392c55696d7887504bf8c Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 22:53:41 +0530 Subject: [PATCH 083/114] library fix library fix --- backend/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 51bd887..247a762 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -8,4 +8,4 @@ pydantic python-dotenv requests_toolbelt sqlmodel -psycopg2-binary \ No newline at end of file +psycopg2-binary==2.9.9 \ No newline at end of file From 0e9734cdfbb7fedbbf2df12c66b09ac99eaaf85d Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 22:58:31 +0530 Subject: [PATCH 084/114] adding new lambda --- backend/cluster-prioritization/Utils.py | 25 +++ .../cluster-prioritization.py | 148 ++++++++++++++++++ 2 files changed, 173 insertions(+) create mode 100644 backend/cluster-prioritization/Utils.py create mode 100644 backend/cluster-prioritization/cluster-prioritization.py diff --git a/backend/cluster-prioritization/Utils.py b/backend/cluster-prioritization/Utils.py new file mode 100644 index 0000000..41ed2b4 --- /dev/null +++ b/backend/cluster-prioritization/Utils.py @@ -0,0 +1,25 @@ +import json +import psycopg2 +def get_postgresql_connection(): + '''get the creds from local config''' + + """ + Establish a connection to a PostgreSQL database. + + Parameters: + host (str): The hostname of the PostgreSQL server. + database (str): The name of the database to connect to. + user (str): The username to connect with. + password (str): The password for the user. + + Returns: + psycopg2.extensions.connection: A connection object to the PostgreSQL database. + """ + try: + with open("pg_config.json") as f: + config = json.load(f) + conn = psycopg2.connect(**config) + return conn + except psycopg2.Error as e: + print("Error connecting to PostgreSQL database:", e) + return None \ No newline at end of file diff --git a/backend/cluster-prioritization/cluster-prioritization.py b/backend/cluster-prioritization/cluster-prioritization.py new file mode 100644 index 0000000..8669089 --- /dev/null +++ b/backend/cluster-prioritization/cluster-prioritization.py @@ -0,0 +1,148 @@ +import boto3 +import tarfile +import json +import io +from Utils import get_postgresql_connection +import datetime + + + +def lambda_handler(event, context): + try: + for record in event['Records']: + print(f"New record: {record}") + bucket = record['s3']['bucket']['name'] + key = record['s3']['object']['key'] + print(f"Processing file from bucket: {bucket}, key: {key}") + conn = get_postgresql_connection() + s3 = boto3.client('s3') + print(f"Connecting to S3 bucket: {bucket}") + obj = s3.get_object(Bucket=bucket, Key=key) + print(f"Downloaded object from S3: {key}") + tar_bytes = io.BytesIO(obj['Body'].read()) + print(f"Processing file: {key}") + # Extract .json inside the tar.gz + with tarfile.open(fileobj=tar_bytes, mode='r:gz') as tar: + print(f"Extracting files from tar: {key}") + for member in tar.getmembers(): + print(f"Found member: {member.name}") + if member.name == "output" and member.isfile(): + print(f"Extracting JSON file: {member.name}") + file = tar.extractfile(member) + if not file: + print(f"File {member.name} not found in tar.") + continue + result = json.load(file) + print(f"Extracted JSON: {result}") + break + print(f"Results: {result}") + if result: + print(f"Results found in the file: {key}") + folderSplit = key.split('/') + type = folderSplit[1] + cursor = conn.cursor() + query = "SELECT * FROM comprehend_jobs WHERE entities_path = %s or sentiment_path = %s or key_phrases_path = %s" + cursor.execute(query, (key, key, key)) + row = cursor.fetchone() + print(f"Row found: {row}") + print(f"Type of analysis: {type}") + if row: + article_id = row[0] + print(f"Article ID: {article_id}") + if type == 'entities': + entity_array = result['Entities'] + if entity_array: + ## get the entities from the entities table + add_entities_to_article(conn, cursor, article_id, entity_array) + elif type == 'keyphrases': + keyPhrases_array = result['KeyPhrases'] + if keyPhrases_array: + for keyPhrase in keyPhrases_array: + keyPhrase['Type'] = 'KeyPhrase' + add_entities_to_article(conn, cursor, article_id, keyPhrases_array) + elif type == 'sentiment': + sentiment = result.get('Sentiment', 'NEUTRAL') + if sentiment: + cursor.execute("""update articles set sentiment = %s where article_id = %s""", (sentiment, article_id)) + conn.commit() + cursor.close() + ## delete the s3 object + # s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) + conn.close() + except Exception as e: + print(f"Error processing record: {e}") + return { + 'statusCode': 500, + 'body': json.dumps({'error': str(e)}) + } + +def add_entities_to_article(conn, cursor, article_id, entities): + entities_text = [entity['Text'] for entity in entities] + print(f"Entities to be added: {entities_text}") + cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) + entity_db_array = cursor.fetchall() + print(f"Entities in DB: {entity_db_array}") + location_mentions = [] + officials_involved = [] + relevance_category = [] + print(f"article_id: {article_id}") + + print(f"Relevance category: {relevance_category}") + for entity in entities: + print(f"Processing entity: {entity}") + entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] + print(f"Entity in DB: {entity_in_db}") + if not entity_in_db: + current_time = datetime.datetime.utcnow() + cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) + conn.commit() + db_entity = cursor.fetchone() + print(f"Inserted new entity: {db_entity}") + if entity['Type'] == 'LOCATION': + location_mentions.append(db_entity[0]) + elif entity['Type'] == 'PERSON': + officials_involved.append(db_entity[0]) + else: + relevance_category.append(db_entity[0]) + else: + print(f"Entity already exists in DB: {entity_in_db}") + if entity['Type'] == 'LOCATION': + location_mentions.append(entity_in_db[0][0]) + elif entity['Type'] == 'PERSON': + officials_involved.append(entity_in_db[0][0]) + else: + relevance_category.append(entity_in_db[0][0]) + if location_mentions: + location_mentions = ','.join(map(str, location_mentions)) + cursor.execute("""update articles set location_mentions = %s where article_id = %s""", (location_mentions, article_id)) + + if officials_involved: + officials_involved = ','.join(map(str, officials_involved)) + cursor.execute("""update articles set officials_involved = %s where article_id = %s""", (officials_involved, article_id)) + + if relevance_category: + cursor.execute("SELECT relevance_category FROM articles WHERE article_id = %s", (article_id,)) + existing = cursor.fetchone() + relevance_category = ','.join(map(str, relevance_category)) + if existing[0] is not None: + print(f"Existing relevance category: {existing[0]}") + relevance_category = relevance_category + ',' + existing[0] + cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) + + +# events = [ +# { +# "s3": { +# "bucket": { +# "name": "awstraindata" +# }, +# "object": { +# "key": "output/entities/269854564686-NER-7b5218ec8e556761890504a59e10da02/output/output.tar.gz" +# } +# } +# } +# ] +# obj= { +# "Records": events +# } +# lambda_handler(obj, None) \ No newline at end of file From f9612479b78fca1fdbda7e1919095918ab268026 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 23:06:50 +0530 Subject: [PATCH 085/114] key phrase fix key phrase fix --- backend/raw_data_handler/raw_data_handler.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index a371b3d..bb7e2b4 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -51,14 +51,14 @@ def lambda_handler(event, context): # ContentType='text/csv' # ) conn.commit() - get_data_inline(output_csv.getvalue(), article_id, comprehend, role, cursor, conn) + get_data_inline(output_csv.getvalue(), article_id, article['Date'], comprehend, cursor, conn) cursor.close() conn.close() except Exception as e: traceback.print_exc() print(f"Error processing event: {e}") -def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): +def get_data_inline(data, articles_id, article_date, comprehend, cursor, conn): print(f"Processing data for article ID: {articles_id}") entities_response = comprehend.detect_entities( Text=data, @@ -75,7 +75,7 @@ def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): print(f"Key phrases detected: {response['KeyPhrases']}") for keyPhrase in response['KeyPhrases']: keyPhrase['Type'] = 'KeyPhrase' - add_keyphrase_to_article(conn, cursor, articles_id, response['KeyPhrases']) + add_keyphrase_to_article(conn, cursor, articles_id, article_date, response['KeyPhrases']) sentiment_response = comprehend.detect_sentiment( Text=data, # DataAccessRoleArn=role_arn, @@ -115,7 +115,7 @@ def extract_articles(file_stream): return articles -def add_keyphrase_to_article(conn, cursor, article_id, entities): +def add_keyphrase_to_article(conn, cursor, article_id, article_date, entities): entities_text = [entity['Text'] for entity in entities] print(f"Entities to be added: {entities_text}") cursor.execute("SELECT * FROM keyphrases WHERE phrases in %s", (tuple(entities_text),)) @@ -129,13 +129,18 @@ def add_keyphrase_to_article(conn, cursor, article_id, entities): print(f"Entity in DB: {entity_in_db}") if not entity_in_db: current_time = datetime.datetime.utcnow() - cursor.execute("INSERT INTO keyphrases (create_time,phrases,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) + cursor.execute("INSERT INTO keyphrases (start_datetime,phrases) VALUES (%s, %s) RETURNING id", (article_date, entity['Text'])) conn.commit() db_entity = cursor.fetchone() print(f"Inserted new entity: {db_entity}") relevance_category.append(db_entity[0]) else: print(f"Entity already exists in DB: {entity_in_db}") + cursor.execute("""UPDATE keyphrases SET linkedtimelines = + CASE + WHEN linkedtimelines IS NULL OR linkedtimelines = '' THEN %s + ELSE linkedtimelines || ', ' || %s + END WHERE id = %s""", (article_date, article_date, entity_in_db[0][0])) relevance_category.append(entity_in_db[0][0]) if relevance_category: relevance_category = ','.join(map(str, relevance_category)) From 13058265ef724a8e615ead1680486a8646d190b1 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 23:12:26 +0530 Subject: [PATCH 086/114] minor fix --- backend/Submit/Submit.py | 2 -- backend/raw_data_handler/raw_data_handler.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index a0cfc47..74a5ca1 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -33,7 +33,6 @@ def lambda_handler(event, context): # Parse multipart form multipart_data = decoder.MultipartDecoder(body, content_type) print(f"Multipart data parts: {len(multipart_data.parts)}") - s3_urls = [] conn = get_postgresql_connection() cursor = conn.cursor() for part in multipart_data.parts: @@ -56,7 +55,6 @@ def lambda_handler(event, context): }, "body": json.dumps({ "status": "success", - "s3_urls": s3_urls }) } except Exception as e: diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index bb7e2b4..bc2c3e5 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -125,7 +125,7 @@ def add_keyphrase_to_article(conn, cursor, article_id, article_date, entities): print(f"article_id: {article_id}") for entity in entities: print(f"Processing entity: {entity}") - entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] + entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[2].lower() == entity['Text'].lower()] print(f"Entity in DB: {entity_in_db}") if not entity_in_db: current_time = datetime.datetime.utcnow() From b3b6b428f3bca306a0b9cf8acc22ed5711d6d6ff Mon Sep 17 00:00:00 2001 From: hackathon Date: Sat, 28 Jun 2025 23:38:00 +0530 Subject: [PATCH 087/114] adding new lambda --- .../cluster-prioritization.py | 160 +++--------------- 1 file changed, 28 insertions(+), 132 deletions(-) diff --git a/backend/cluster-prioritization/cluster-prioritization.py b/backend/cluster-prioritization/cluster-prioritization.py index 8669089..f0a54b0 100644 --- a/backend/cluster-prioritization/cluster-prioritization.py +++ b/backend/cluster-prioritization/cluster-prioritization.py @@ -9,140 +9,36 @@ def lambda_handler(event, context): try: - for record in event['Records']: - print(f"New record: {record}") - bucket = record['s3']['bucket']['name'] - key = record['s3']['object']['key'] - print(f"Processing file from bucket: {bucket}, key: {key}") - conn = get_postgresql_connection() - s3 = boto3.client('s3') - print(f"Connecting to S3 bucket: {bucket}") - obj = s3.get_object(Bucket=bucket, Key=key) - print(f"Downloaded object from S3: {key}") - tar_bytes = io.BytesIO(obj['Body'].read()) - print(f"Processing file: {key}") - # Extract .json inside the tar.gz - with tarfile.open(fileobj=tar_bytes, mode='r:gz') as tar: - print(f"Extracting files from tar: {key}") - for member in tar.getmembers(): - print(f"Found member: {member.name}") - if member.name == "output" and member.isfile(): - print(f"Extracting JSON file: {member.name}") - file = tar.extractfile(member) - if not file: - print(f"File {member.name} not found in tar.") - continue - result = json.load(file) - print(f"Extracted JSON: {result}") - break - print(f"Results: {result}") - if result: - print(f"Results found in the file: {key}") - folderSplit = key.split('/') - type = folderSplit[1] - cursor = conn.cursor() - query = "SELECT * FROM comprehend_jobs WHERE entities_path = %s or sentiment_path = %s or key_phrases_path = %s" - cursor.execute(query, (key, key, key)) - row = cursor.fetchone() - print(f"Row found: {row}") - print(f"Type of analysis: {type}") - if row: - article_id = row[0] - print(f"Article ID: {article_id}") - if type == 'entities': - entity_array = result['Entities'] - if entity_array: - ## get the entities from the entities table - add_entities_to_article(conn, cursor, article_id, entity_array) - elif type == 'keyphrases': - keyPhrases_array = result['KeyPhrases'] - if keyPhrases_array: - for keyPhrase in keyPhrases_array: - keyPhrase['Type'] = 'KeyPhrase' - add_entities_to_article(conn, cursor, article_id, keyPhrases_array) - elif type == 'sentiment': - sentiment = result.get('Sentiment', 'NEUTRAL') - if sentiment: - cursor.execute("""update articles set sentiment = %s where article_id = %s""", (sentiment, article_id)) - conn.commit() - cursor.close() - ## delete the s3 object - # s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) - conn.close() + conn = get_postgresql_connection() + cursor = conn.cursor() + query = "SELECT * FROM clusters WHERE TO_DATE(startdate, 'DD/MM/YYYY') >= CURRENT_DATE ORDER BY clusters.referencecount DESC limit 20;" + cursor.execute(query) + rows = cursor.fetchall() + if rows: + for row in rows: + print(f"ID: {row[0]}, Priority: {row[1]}") + update_query = """ + WITH ranked AS ( + SELECT id, + 21 - ROW_NUMBER() OVER (ORDER BY clusters.referencecount DESC) AS priority + FROM clusters + WHERE TO_DATE(startdate, 'DD/MM/YYYY') >= CURRENT_DATE + LIMIT 20 + ) + UPDATE clusters t + SET priority = r.priority + FROM ranked r + WHERE t.id = r.id; + """ + cursor.execute(update_query) + conn.commit() + cursor.close() + ## delete the s3 object + # s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) + conn.close() except Exception as e: print(f"Error processing record: {e}") return { 'statusCode': 500, 'body': json.dumps({'error': str(e)}) - } - -def add_entities_to_article(conn, cursor, article_id, entities): - entities_text = [entity['Text'] for entity in entities] - print(f"Entities to be added: {entities_text}") - cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) - entity_db_array = cursor.fetchall() - print(f"Entities in DB: {entity_db_array}") - location_mentions = [] - officials_involved = [] - relevance_category = [] - print(f"article_id: {article_id}") - - print(f"Relevance category: {relevance_category}") - for entity in entities: - print(f"Processing entity: {entity}") - entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] - print(f"Entity in DB: {entity_in_db}") - if not entity_in_db: - current_time = datetime.datetime.utcnow() - cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) - conn.commit() - db_entity = cursor.fetchone() - print(f"Inserted new entity: {db_entity}") - if entity['Type'] == 'LOCATION': - location_mentions.append(db_entity[0]) - elif entity['Type'] == 'PERSON': - officials_involved.append(db_entity[0]) - else: - relevance_category.append(db_entity[0]) - else: - print(f"Entity already exists in DB: {entity_in_db}") - if entity['Type'] == 'LOCATION': - location_mentions.append(entity_in_db[0][0]) - elif entity['Type'] == 'PERSON': - officials_involved.append(entity_in_db[0][0]) - else: - relevance_category.append(entity_in_db[0][0]) - if location_mentions: - location_mentions = ','.join(map(str, location_mentions)) - cursor.execute("""update articles set location_mentions = %s where article_id = %s""", (location_mentions, article_id)) - - if officials_involved: - officials_involved = ','.join(map(str, officials_involved)) - cursor.execute("""update articles set officials_involved = %s where article_id = %s""", (officials_involved, article_id)) - - if relevance_category: - cursor.execute("SELECT relevance_category FROM articles WHERE article_id = %s", (article_id,)) - existing = cursor.fetchone() - relevance_category = ','.join(map(str, relevance_category)) - if existing[0] is not None: - print(f"Existing relevance category: {existing[0]}") - relevance_category = relevance_category + ',' + existing[0] - cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) - - -# events = [ -# { -# "s3": { -# "bucket": { -# "name": "awstraindata" -# }, -# "object": { -# "key": "output/entities/269854564686-NER-7b5218ec8e556761890504a59e10da02/output/output.tar.gz" -# } -# } -# } -# ] -# obj= { -# "Records": events -# } -# lambda_handler(obj, None) \ No newline at end of file + } \ No newline at end of file From fcb2a980ff42bdeff8075c0e0ce0d598b1b14451 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sat, 28 Jun 2025 23:57:01 +0530 Subject: [PATCH 088/114] Trial changes --- backend/requirements.txt | 6 +++--- backend/web_api/web_api.py | 13 +++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 8051bbe..5f74a63 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -8,7 +8,7 @@ pydantic python-dotenv requests_toolbelt sqlmodel -# psycopg2-binary +psycopg2-binary==2.9.9 mangum -psycopg[binary] # The modern v3 driver -psycopg-pool # The connection pool for v3 \ No newline at end of file +# psycopg[binary] # The modern v3 driver +# psycopg-pool # The connection pool for v3 \ No newline at end of file diff --git a/backend/web_api/web_api.py b/backend/web_api/web_api.py index 7568cd4..ee68838 100644 --- a/backend/web_api/web_api.py +++ b/backend/web_api/web_api.py @@ -14,8 +14,8 @@ # from fastapi import FastAPI, HTTPException, Depends from pydantic import BaseModel, Field -from psycopg import ProgrammingError -from psycopg_pool import ConnectionPool +from psycopg2 import ProgrammingError +import psycopg2 from contextlib import closing from mangum import Mangum @@ -44,7 +44,7 @@ async def lifespan(app: FastAPI): print("Application startup...") yield print("Application shutdown...") - pool.close() + # pool.close() # This event handler runs once when the application starts. @@ -123,11 +123,12 @@ class NaturalLanguageQuery(BaseModel): # IMPORTANT: Use a read-only user for the database connection. DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://your_user:your_password@your_aurora_endpoint/myappdb") -pool = ConnectionPool(conninfo=DATABASE_URL) - def get_db_connection(): - with pool.connection() as conn: + conn = psycopg2.connect(DATABASE_URL) + try: yield conn + finally: + conn.close() @app.post("/query/agent", response_model=List[Dict[str, Any]]) def query_with_bedrock_agent(query: NaturalLanguageQuery, conn=Depends(get_db_connection)): From 203eadf6db35ae065934dd429b88dd44c37227cf Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sat, 28 Jun 2025 23:57:45 +0530 Subject: [PATCH 089/114] data format change --- backend/raw_data_handler/raw_data_handler.py | 118 +++++++++---------- 1 file changed, 54 insertions(+), 64 deletions(-) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index bc2c3e5..8fc43ee 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -118,84 +118,74 @@ def extract_articles(file_stream): def add_keyphrase_to_article(conn, cursor, article_id, article_date, entities): entities_text = [entity['Text'] for entity in entities] print(f"Entities to be added: {entities_text}") - cursor.execute("SELECT * FROM keyphrases WHERE phrases in %s", (tuple(entities_text),)) - entity_db_array = cursor.fetchall() - print(f"Entities in DB: {entity_db_array}") - relevance_category = [] + # cursor.execute("SELECT * FROM keyphrases WHERE phrases in %s", (tuple(entities_text),)) + # entity_db_array = cursor.fetchall() + # print(f"Entities in DB: {entity_db_array}") + # relevance_category = [] print(f"article_id: {article_id}") - for entity in entities: - print(f"Processing entity: {entity}") - entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[2].lower() == entity['Text'].lower()] - print(f"Entity in DB: {entity_in_db}") - if not entity_in_db: - current_time = datetime.datetime.utcnow() - cursor.execute("INSERT INTO keyphrases (start_datetime,phrases) VALUES (%s, %s) RETURNING id", (article_date, entity['Text'])) - conn.commit() - db_entity = cursor.fetchone() - print(f"Inserted new entity: {db_entity}") - relevance_category.append(db_entity[0]) - else: - print(f"Entity already exists in DB: {entity_in_db}") - cursor.execute("""UPDATE keyphrases SET linkedtimelines = - CASE - WHEN linkedtimelines IS NULL OR linkedtimelines = '' THEN %s - ELSE linkedtimelines || ', ' || %s - END WHERE id = %s""", (article_date, article_date, entity_in_db[0][0])) - relevance_category.append(entity_in_db[0][0]) - if relevance_category: - relevance_category = ','.join(map(str, relevance_category)) + # for entity in entities: + # print(f"Processing entity: {entity}") + # entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[2].lower() == entity['Text'].lower()] + # print(f"Entity in DB: {entity_in_db}") + # if not entity_in_db: + # cursor.execute("INSERT INTO keyphrases (start_datetime,phrases) VALUES (%s, %s) RETURNING id", (article_date, entity['Text'])) + # conn.commit() + # db_entity = cursor.fetchone() + # print(f"Inserted new entity: {db_entity}") + # relevance_category.append(db_entity[0]) + # else: + # print(f"Entity already exists in DB: {entity_in_db}") + # cursor.execute("""UPDATE keyphrases SET linkedtimelines = + # CASE + # WHEN linkedtimelines IS NULL OR linkedtimelines = '' THEN %s + # ELSE linkedtimelines || ', ' || %s + # END WHERE id = %s""", (article_date, article_date, entity_in_db[0][0])) + # relevance_category.append(entity_in_db[0][0]) + if entities_text: + relevance_category = ','.join(map(str, entities_text)) cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) def add_entities_to_article(conn, cursor, article_id, entities): entities_text = [entity['Text'] for entity in entities] print(f"Entities to be added: {entities_text}") - cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) - entity_db_array = cursor.fetchall() - print(f"Entities in DB: {entity_db_array}") + # cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) + # entity_db_array = cursor.fetchall() + # print(f"Entities in DB: {entity_db_array}") location_mentions = [] officials_involved = [] # relevance_category = [] print(f"article_id: {article_id}") - - # print(f"Relevance category: {relevance_category}") + + # for entity in entities: + # print(f"Processing entity: {entity}") + # entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] + # print(f"Entity in DB: {entity_in_db}") + # if not entity_in_db: + # current_time = datetime.datetime.utcnow() + # cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) + # conn.commit() + # db_entity = cursor.fetchone() + # print(f"Inserted new entity: {db_entity}") + # if entity['Type'] == 'LOCATION': + # location_mentions.append(db_entity[0]) + # elif entity['Type'] == 'PERSON' or entity['Type'] == 'ORGANIZATION': + # officials_involved.append(db_entity[0]) + # else: + # print(f"Entity already exists in DB: {entity_in_db}") + # if entity['Type'] == 'LOCATION': + # location_mentions.append(entity_in_db[0][0]) + # elif entity['Type'] == 'PERSON' or entity['Type'] == 'ORGANIZATION': + # officials_involved.append(entity_in_db[0][0]) for entity in entities: - print(f"Processing entity: {entity}") - entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] - print(f"Entity in DB: {entity_in_db}") - if not entity_in_db: - current_time = datetime.datetime.utcnow() - cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) - conn.commit() - db_entity = cursor.fetchone() - print(f"Inserted new entity: {db_entity}") - if entity['Type'] == 'LOCATION': - location_mentions.append(db_entity[0]) - elif entity['Type'] == 'PERSON' or entity['Type'] == 'ORGANIZATION': - officials_involved.append(db_entity[0]) - # else: - # relevance_category.append(db_entity[0]) - else: - print(f"Entity already exists in DB: {entity_in_db}") - if entity['Type'] == 'LOCATION': - location_mentions.append(entity_in_db[0][0]) - elif entity['Type'] == 'PERSON' or entity['Type'] == 'ORGANIZATION': - officials_involved.append(entity_in_db[0][0]) - # else: - # relevance_category.append(entity_in_db[0][0]) + if entity['Type'] == 'LOCATION': + location_mentions.append(entity['Text'].lower()) + elif entity['Type'] == 'PERSON' or entity['Type'] == 'ORGANIZATION': + officials_involved.append(entity['Text'].lower()) + print(f"Processing entity: {entity}") if location_mentions: location_mentions = ','.join(map(str, location_mentions)) cursor.execute("""update articles set location_mentions = %s where article_id = %s""", (location_mentions, article_id)) if officials_involved: officials_involved = ','.join(map(str, officials_involved)) - cursor.execute("""update articles set officials_involved = %s where article_id = %s""", (officials_involved, article_id)) - - # if relevance_category: - # cursor.execute("SELECT relevance_category FROM articles WHERE article_id = %s", (article_id,)) - # existing = cursor.fetchone() - # relevance_category = ','.join(map(str, relevance_category)) - # if existing[0] is not None: - # print(f"Existing relevance category: {existing[0]}") - # relevance_category = relevance_category + ',' + existing[0] - # cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) - # cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) \ No newline at end of file + cursor.execute("""update articles set officials_involved = %s where article_id = %s""", (officials_involved, article_id)) \ No newline at end of file From 217a9414ad173945726384326c16bdc10dfc5ac3 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sun, 29 Jun 2025 00:30:55 +0530 Subject: [PATCH 090/114] pushing changes --- backend/web_api/web_api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/web_api/web_api.py b/backend/web_api/web_api.py index ee68838..7126648 100644 --- a/backend/web_api/web_api.py +++ b/backend/web_api/web_api.py @@ -5,8 +5,8 @@ from fastapi import FastAPI, Depends, HTTPException, Response from sqlmodel import Session, select -from .database import get_session -from .models import Articles, ArticleCreate, ArticleRead, Clusters, ClusterCreate, ClusterRead +from backend.web_api.database import get_session +from backend.web_api.models import Articles, ArticleCreate, ArticleRead, Clusters, ClusterCreate, ClusterRead from typing import List, Dict, Any from contextlib import asynccontextmanager, closing @@ -23,7 +23,7 @@ # from bedrock_agent import generate_sql_from_prompt # Import our new agent invoker function -from .bedrock_agent_invoke import invoke_bedrock_agent_to_get_sql +from backend.web_api.bedrock_agent_invoke import invoke_bedrock_agent_to_get_sql # Load environment variables from .env file from dotenv import load_dotenv From d80c875afc504510a2f3f2b6d784b3a5a690fcf5 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sun, 29 Jun 2025 00:43:24 +0530 Subject: [PATCH 091/114] pushing changes --- backend/web_api/web_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/web_api/web_api.py b/backend/web_api/web_api.py index 7126648..2b8b072 100644 --- a/backend/web_api/web_api.py +++ b/backend/web_api/web_api.py @@ -21,7 +21,7 @@ # # Import our new Bedrock agent function # from bedrock_agent import generate_sql_from_prompt - +# namasthe # Import our new agent invoker function from backend.web_api.bedrock_agent_invoke import invoke_bedrock_agent_to_get_sql From bdb13b2a6a377433ed8279533d75023c9d8a7144 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sun, 29 Jun 2025 01:35:12 +0530 Subject: [PATCH 092/114] adding an api --- api/query-agent.json | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 api/query-agent.json diff --git a/api/query-agent.json b/api/query-agent.json new file mode 100644 index 0000000..602606a --- /dev/null +++ b/api/query-agent.json @@ -0,0 +1,10 @@ +{ + "apiName": "hackathon", + "resourcePath": "query/agent", + "method": { + "httpMethod": "POST", + "authorizationType": "NONE", + "lambdaFunctionName": "web_api" + }, + "deploy": true +} \ No newline at end of file From 7a7a25c62a6b3875da22bbe9d41ddf3f48e7d4de Mon Sep 17 00:00:00 2001 From: hackathon Date: Sun, 29 Jun 2025 01:36:50 +0530 Subject: [PATCH 093/114] adding an api --- api/query-agent.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/query-agent.json b/api/query-agent.json index 602606a..8dd8ff2 100644 --- a/api/query-agent.json +++ b/api/query-agent.json @@ -1,6 +1,6 @@ { "apiName": "hackathon", - "resourcePath": "query/agent", + "resourcePath": "queryagent", "method": { "httpMethod": "POST", "authorizationType": "NONE", From 7d9b9c51f70861a197c13a27db23ce2842ca857e Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sun, 29 Jun 2025 01:37:56 +0530 Subject: [PATCH 094/114] database fix --- backend/web_api/database.py | 2 +- backend/web_api/web_api.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/web_api/database.py b/backend/web_api/database.py index 9a01499..4f7b9fa 100644 --- a/backend/web_api/database.py +++ b/backend/web_api/database.py @@ -6,7 +6,7 @@ DATABASE_URL = os.environ.get( "DATABASE_URL", "postgresql://your_user:your_password@your_aurora_endpoint/myappdb" -).replace("postgresql://", "postgresql+psycopg://") +).replace("postgresql://", "postgresql+psycopg2://") # The 'connect_args' is needed for SQLite, but not for PostgreSQL. # For PostgreSQL, you can remove it. diff --git a/backend/web_api/web_api.py b/backend/web_api/web_api.py index 2b8b072..ec95d27 100644 --- a/backend/web_api/web_api.py +++ b/backend/web_api/web_api.py @@ -130,7 +130,7 @@ def get_db_connection(): finally: conn.close() -@app.post("/query/agent", response_model=List[Dict[str, Any]]) +@app.post("/queryagent", response_model=List[Dict[str, Any]]) def query_with_bedrock_agent(query: NaturalLanguageQuery, conn=Depends(get_db_connection)): """ Takes a natural language question, sends it to a pre-configured Bedrock Agent, From b3e5efb60b929fb1d3619b2451f5a0c573c08e30 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sun, 29 Jun 2025 01:44:25 +0530 Subject: [PATCH 095/114] database fix --- backend/web_api/web_api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/web_api/web_api.py b/backend/web_api/web_api.py index ec95d27..4fcab05 100644 --- a/backend/web_api/web_api.py +++ b/backend/web_api/web_api.py @@ -5,8 +5,8 @@ from fastapi import FastAPI, Depends, HTTPException, Response from sqlmodel import Session, select -from backend.web_api.database import get_session -from backend.web_api.models import Articles, ArticleCreate, ArticleRead, Clusters, ClusterCreate, ClusterRead +from database import get_session +from models import Articles, ArticleCreate, ArticleRead, Clusters, ClusterCreate, ClusterRead from typing import List, Dict, Any from contextlib import asynccontextmanager, closing @@ -23,7 +23,7 @@ # from bedrock_agent import generate_sql_from_prompt # namasthe # Import our new agent invoker function -from backend.web_api.bedrock_agent_invoke import invoke_bedrock_agent_to_get_sql +from bedrock_agent_invoke import invoke_bedrock_agent_to_get_sql # Load environment variables from .env file from dotenv import load_dotenv From 47c2cdb50d26e6e10358143500d319d6882bde47 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sun, 29 Jun 2025 01:45:35 +0530 Subject: [PATCH 096/114] adding an api --- api/deploy-api.py | 37 ++++++++++++++++++++++++------------- api/query-test.json | 10 ++++++++++ 2 files changed, 34 insertions(+), 13 deletions(-) create mode 100644 api/query-test.json diff --git a/api/deploy-api.py b/api/deploy-api.py index 08298a0..5f0acc3 100644 --- a/api/deploy-api.py +++ b/api/deploy-api.py @@ -20,22 +20,33 @@ def get_or_create_api(api_name): response = apigateway.create_rest_api(name=api_name) return response["id"] -def get_or_create_resource(api_id, resource_path): +def get_or_create_resource(api_id, full_path): + # Normalize and split nested path: "users/{userId}" => ["users", "{userId}"] + parts = [p for p in full_path.strip("/").split("/") if p] resources = apigateway.get_resources(restApiId=api_id)["items"] - root_id = next(item["id"] for item in resources if item["path"] == "/") - for res in resources: - if res["path"] == f"/{resource_path}": - print(f"Found resource: /{resource_path}") - return res["id"] + # Build a path-to-id map + path_map = {res["path"]: res["id"] for res in resources} + parent_path = "" + parent_id = path_map["/"] # root path + + for part in parts: + current_path = f"{parent_path}/{part}" if parent_path else f"/{part}" + if current_path in path_map: + parent_id = path_map[current_path] + else: + print(f"Creating resource: {current_path}") + response = apigateway.create_resource( + restApiId=api_id, + parentId=parent_id, + pathPart=part + ) + parent_id = response["id"] + path_map[current_path] = parent_id + parent_path = current_path + + return parent_id - print(f"Creating resource: /{resource_path}") - response = apigateway.create_resource( - restApiId=api_id, - parentId=root_id, - pathPart=resource_path - ) - return response["id"] def method_exists(api_id, resource_id, http_method): try: diff --git a/api/query-test.json b/api/query-test.json new file mode 100644 index 0000000..22c5660 --- /dev/null +++ b/api/query-test.json @@ -0,0 +1,10 @@ +{ + "apiName": "hackathon", + "resourcePath": "test/query", + "method": { + "httpMethod": "POST", + "authorizationType": "NONE", + "lambdaFunctionName": "web_api" + }, + "deploy": true +} \ No newline at end of file From 55dd4014bd3a3c30687974ecbc4df56c0d6c151c Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 02:29:16 +0530 Subject: [PATCH 097/114] integrate relevance agent --- backend/raw_data_handler/raw_data_handler.py | 6 +++ backend/test_agent.py | 39 ++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 backend/test_agent.py diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index 8fc43ee..b9003ec 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -11,6 +11,7 @@ import re import boto3 import traceback +from test_agent import is_relevance BUCKET_NAME = 'awstraindata' role = 'arn:aws:iam::269854564686:role/hackathon-comprehend-role' @@ -32,6 +33,11 @@ def lambda_handler(event, context): print(f"Extracted {len(articles)} articles from part") for article in articles: print(f"Processing article: {article['Title']}") + # Check if article is relevant + is_relevant = is_relevance(article) + if not is_relevant: + print(f"Article {article['Title']} is not relevant, skipping") + continue output_csv = io.StringIO() writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) # writer.writeheader() diff --git a/backend/test_agent.py b/backend/test_agent.py new file mode 100644 index 0000000..f1db899 --- /dev/null +++ b/backend/test_agent.py @@ -0,0 +1,39 @@ +import re +import boto3 +import json + +def is_relevance(article): + # Initialize Bedrock Agent Runtime client + bedrock_agent = boto3.client("bedrock-agent-runtime", region_name="us-east-1") # replace with your region + + # Agent identifiers (get these from Bedrock console) + agent_id = "IKXDLL0K7W" + agent_alias_id = "DY9KWQNAGM" + + # Your dynamic user message (e.g., relationship analysis prompt) + user_input = article['Content'] + + # Call the agent + response = bedrock_agent.invoke_agent( + agentId=agent_id, + agentAliasId=agent_alias_id, + sessionId="news-analysis-session-001", + inputText=user_input + ) + is_relevant = False + # Read the response stream + for event in response["completion"]: + if "chunk" in event: + payload = event["chunk"]["bytes"] + chunk_str = payload.decode("utf-8") + match = re.search(r"\{.*\}", chunk_str, re.DOTALL) + if match: + json_block = match.group(0) + parsed_json = json.loads(json_block) + print("Parsed JSON:", parsed_json) + is_relevant = parsed_json.get("relevance_score", 0) > 0.5 + print("Is relevant:", is_relevant) + print(parsed_json.get("relevance_score", "No content found")) + else: + print("āŒ No JSON found in response") + return is_relevant From ce2b9f299095cee2cdf31ebc12e2699bad43f069 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sun, 29 Jun 2025 02:35:02 +0530 Subject: [PATCH 098/114] trial v1 --- backend/web_api/bedrock_agent_invoke.py | 83 +++++++++++++++++-------- 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/backend/web_api/bedrock_agent_invoke.py b/backend/web_api/bedrock_agent_invoke.py index fbe3bbb..30f44c2 100644 --- a/backend/web_api/bedrock_agent_invoke.py +++ b/backend/web_api/bedrock_agent_invoke.py @@ -43,41 +43,72 @@ def invoke_bedrock_agent_to_get_sql( event_stream = response['completion'] final_sql_query = None + all_events = [] - # The response is a stream of events. We need to parse it to find the - # 'observation' from the action group that contains the final SQL query. + # Collect all events for post-processing for event in event_stream: + all_events.append(event) if 'trace' in event: trace_part = event['trace']['trace'] if 'observation' in trace_part: observation = trace_part['observation'] - if 'actionGroupInvocationOutput' in observation: - output_str = observation['actionGroupInvocationOutput']['text'] - # The output is often a JSON string, we need to parse it - try: - output_json = json.loads(output_str) - # The key 'generatedQuery' might vary based on your - # Lambda function's return format for the action group. - # Inspect your agent's trace to find the correct key. - if 'generatedQuery' in output_json: - final_sql_query = output_json['generatedQuery'] - print(f"Extracted SQL from Agent trace: {final_sql_query}") - # We found the query, no need to process further - break - except json.JSONDecodeError: - print(f"Could not decode observation output: {output_str}") - + # If observation is a list, iterate through it + if isinstance(observation, list): + for obs in observation: + if 'finalResponse' in obs: + final_response = obs['finalResponse'] + text = final_response.get('text', '') + # Extract SQL from markdown code block if present + if '```sql' in text: + sql_start = text.find('```sql') + len('```sql') + sql_end = text.find('```', sql_start) + final_sql_query = text[sql_start:sql_end].strip() + else: + final_sql_query = text.strip() + print(f"Extracted SQL from finalResponse: {final_sql_query}") + break + # If observation is a dict, handle as before + elif isinstance(observation, dict): + if 'finalResponse' in observation: + final_response = observation['finalResponse'] + text = final_response.get('text', '') + if '```sql' in text: + sql_start = text.find('```sql') + len('```sql') + sql_end = text.find('```', sql_start) + final_sql_query = text[sql_start:sql_end].strip() + else: + final_sql_query = text.strip() + print(f"Extracted SQL from finalResponse: {final_sql_query}") + break + if 'actionGroupInvocationOutput' in observation: + output_str = observation['actionGroupInvocationOutput']['text'] + try: + output_json = json.loads(output_str) + if 'generatedQuery' in output_json: + final_sql_query = output_json['generatedQuery'] + print(f"Extracted SQL from Agent trace: {final_sql_query}") + break + except json.JSONDecodeError: + print(f"Could not decode observation output: {output_str}") + # Fallback: check for chunk events if not found if not final_sql_query: - # Fallback if the detailed trace isn't as expected, check final response - # Note: This part is less reliable for getting the raw SQL - for event in event_stream: + for event in all_events: if 'chunk' in event: - data = json.loads(event['chunk']['bytes'].decode()) - if data['type'] == 'finalResponse': - print("Warning: Could not find SQL in trace, final response text might not be a query.") - # This text is often a natural language answer, not the SQL itself - break + try: + data = json.loads(event['chunk']['bytes'].decode()) + if data.get('type') == 'finalResponse': + text = data.get('text', '') + if '```sql' in text: + sql_start = text.find('```sql') + len('```sql') + sql_end = text.find('```', sql_start) + final_sql_query = text[sql_start:sql_end].strip() + else: + final_sql_query = text.strip() + print(f"Extracted SQL from chunk finalResponse: {final_sql_query}") + break + except Exception as e: + print(f"Error decoding chunk: {e}") return final_sql_query From 3dc16d3211f81664a953b8019029f06559911238 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 02:35:39 +0530 Subject: [PATCH 099/114] debug the agent --- backend/test_agent.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/test_agent.py b/backend/test_agent.py index f1db899..787a817 100644 --- a/backend/test_agent.py +++ b/backend/test_agent.py @@ -22,12 +22,19 @@ def is_relevance(article): ) is_relevant = False # Read the response stream + print("Response from Bedrock Agent:") + print(response) for event in response["completion"]: + print("Event:", event) if "chunk" in event: + print("Processing chunk...") + print("Chunk ID:", event["chunk"]) payload = event["chunk"]["bytes"] chunk_str = payload.decode("utf-8") match = re.search(r"\{.*\}", chunk_str, re.DOTALL) if match: + print("Found JSON block in chunk") + print("JSON Block:", match.group(0)) json_block = match.group(0) parsed_json = json.loads(json_block) print("Parsed JSON:", parsed_json) From aac33ddad31bcb66339b919c88c4f807f3845993 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sun, 29 Jun 2025 02:44:04 +0530 Subject: [PATCH 100/114] trial v2 --- backend/web_api/bedrock_agent_invoke.py | 33 ++++++++++++++----------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/backend/web_api/bedrock_agent_invoke.py b/backend/web_api/bedrock_agent_invoke.py index 30f44c2..77e0a00 100644 --- a/backend/web_api/bedrock_agent_invoke.py +++ b/backend/web_api/bedrock_agent_invoke.py @@ -95,20 +95,25 @@ def invoke_bedrock_agent_to_get_sql( if not final_sql_query: for event in all_events: if 'chunk' in event: - try: - data = json.loads(event['chunk']['bytes'].decode()) - if data.get('type') == 'finalResponse': - text = data.get('text', '') - if '```sql' in text: - sql_start = text.find('```sql') + len('```sql') - sql_end = text.find('```', sql_start) - final_sql_query = text[sql_start:sql_end].strip() - else: - final_sql_query = text.strip() - print(f"Extracted SQL from chunk finalResponse: {final_sql_query}") - break - except Exception as e: - print(f"Error decoding chunk: {e}") + raw_bytes = event['chunk']['bytes'] + print(f"Raw chunk bytes: {raw_bytes!r}") + if raw_bytes: + try: + data = json.loads(raw_bytes.decode()) + if data.get('type') == 'finalResponse': + text = data.get('text', '') + if '```sql' in text: + sql_start = text.find('```sql') + len('```sql') + sql_end = text.find('```', sql_start) + final_sql_query = text[sql_start:sql_end].strip() + else: + final_sql_query = text.strip() + print(f"Extracted SQL from chunk finalResponse: {final_sql_query}") + break + except Exception as e: + print(f"Error decoding chunk: {e}") + else: + print("Chunk bytes are empty, skipping.") return final_sql_query From 793bcac77b50f2a3f362f6db7a791febde084fa4 Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sun, 29 Jun 2025 02:56:11 +0530 Subject: [PATCH 101/114] trial v3 --- backend/web_api/bedrock_agent_invoke.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/backend/web_api/bedrock_agent_invoke.py b/backend/web_api/bedrock_agent_invoke.py index 77e0a00..f3065cf 100644 --- a/backend/web_api/bedrock_agent_invoke.py +++ b/backend/web_api/bedrock_agent_invoke.py @@ -27,6 +27,28 @@ def invoke_bedrock_agent_to_get_sql( Returns: The generated SQL query string, or None if not found. + """ + prompt = f""" + convert the natural language query into a pgsql query. Return ONLY the sql command as an answer. + Here is the table schema: + + CREATE TABLE articles( + articles_id text, + title text, + body text, + "source" text, + published_date text, + entities text, + sentiment text + ); + + Here is the user's question: + + {question} + + while creating query ensure that it can ignore case sensitive. + Give me the stream response as json in an API call + """ try: # The invoke_agent API returns a streaming response. @@ -34,7 +56,7 @@ def invoke_bedrock_agent_to_get_sql( agentId=agent_id, agentAliasId=agent_alias_id, sessionId=session_id, - inputText=question, + inputText=prompt, streamingConfigurations = { "applyGuardrailInterval" : 20, "streamFinalResponse" : False From f4000981505123f8e3fb844ba6727fb55e562b5c Mon Sep 17 00:00:00 2001 From: kmangalpally Date: Sun, 29 Jun 2025 03:13:49 +0530 Subject: [PATCH 102/114] trial v4 --- backend/web_api/bedrock_agent_invoke.py | 121 +++++++++++++----------- 1 file changed, 67 insertions(+), 54 deletions(-) diff --git a/backend/web_api/bedrock_agent_invoke.py b/backend/web_api/bedrock_agent_invoke.py index f3065cf..7867aa1 100644 --- a/backend/web_api/bedrock_agent_invoke.py +++ b/backend/web_api/bedrock_agent_invoke.py @@ -68,58 +68,59 @@ def invoke_bedrock_agent_to_get_sql( all_events = [] # Collect all events for post-processing - for event in event_stream: - all_events.append(event) - if 'trace' in event: - trace_part = event['trace']['trace'] - if 'observation' in trace_part: - observation = trace_part['observation'] - # If observation is a list, iterate through it - if isinstance(observation, list): - for obs in observation: - if 'finalResponse' in obs: - final_response = obs['finalResponse'] - text = final_response.get('text', '') - # Extract SQL from markdown code block if present - if '```sql' in text: - sql_start = text.find('```sql') + len('```sql') - sql_end = text.find('```', sql_start) - final_sql_query = text[sql_start:sql_end].strip() - else: - final_sql_query = text.strip() - print(f"Extracted SQL from finalResponse: {final_sql_query}") - break - # If observation is a dict, handle as before - elif isinstance(observation, dict): - if 'finalResponse' in observation: - final_response = observation['finalResponse'] - text = final_response.get('text', '') - if '```sql' in text: - sql_start = text.find('```sql') + len('```sql') - sql_end = text.find('```', sql_start) - final_sql_query = text[sql_start:sql_end].strip() - else: - final_sql_query = text.strip() - print(f"Extracted SQL from finalResponse: {final_sql_query}") - break - if 'actionGroupInvocationOutput' in observation: - output_str = observation['actionGroupInvocationOutput']['text'] - try: - output_json = json.loads(output_str) - if 'generatedQuery' in output_json: - final_sql_query = output_json['generatedQuery'] - print(f"Extracted SQL from Agent trace: {final_sql_query}") - break - except json.JSONDecodeError: - print(f"Could not decode observation output: {output_str}") + # for event in event_stream: + # all_events.append(event) + # if 'trace' in event: + # trace_part = event['trace']['trace'] + # if 'observation' in trace_part: + # observation = trace_part['observation'] + # # If observation is a list, iterate through it + # if isinstance(observation, list): + # for obs in observation: + # if 'finalResponse' in obs: + # final_response = obs['finalResponse'] + # text = final_response.get('text', '') + # # Extract SQL from markdown code block if present + # if '```sql' in text: + # sql_start = text.find('```sql') + len('```sql') + # sql_end = text.find('```', sql_start) + # final_sql_query = text[sql_start:sql_end].strip() + # else: + # final_sql_query = text.strip() + # print(f"Extracted SQL from finalResponse: {final_sql_query}") + # break + # # If observation is a dict, handle as before + # elif isinstance(observation, dict): + # if 'finalResponse' in observation: + # final_response = observation['finalResponse'] + # text = final_response.get('text', '') + # if '```sql' in text: + # sql_start = text.find('```sql') + len('```sql') + # sql_end = text.find('```', sql_start) + # final_sql_query = text[sql_start:sql_end].strip() + # else: + # final_sql_query = text.strip() + # print(f"Extracted SQL from finalResponse: {final_sql_query}") + # break + # if 'actionGroupInvocationOutput' in observation: + # output_str = observation['actionGroupInvocationOutput']['text'] + # try: + # output_json = json.loads(output_str) + # if 'generatedQuery' in output_json: + # final_sql_query = output_json['generatedQuery'] + # print(f"Extracted SQL from Agent trace: {final_sql_query}") + # break + # except json.JSONDecodeError: + # print(f"Could not decode observation output: {output_str}") # Fallback: check for chunk events if not found - if not final_sql_query: - for event in all_events: - if 'chunk' in event: - raw_bytes = event['chunk']['bytes'] - print(f"Raw chunk bytes: {raw_bytes!r}") - if raw_bytes: + for event in all_events: + if 'chunk' in event: + raw_bytes = event['chunk']['bytes'] + print(f"Raw chunk bytes: {raw_bytes!r}") + if raw_bytes: + try: + # Try to decode as JSON, but if it fails, treat as plain text try: data = json.loads(raw_bytes.decode()) if data.get('type') == 'finalResponse': @@ -132,10 +133,22 @@ def invoke_bedrock_agent_to_get_sql( final_sql_query = text.strip() print(f"Extracted SQL from chunk finalResponse: {final_sql_query}") break - except Exception as e: - print(f"Error decoding chunk: {e}") - else: - print("Chunk bytes are empty, skipping.") + except json.JSONDecodeError: + # Not JSON, treat as plain text + text = raw_bytes.decode() + if '```sql' in text: + sql_start = text.find('```sql') + len('```sql') + sql_end = text.find('```', sql_start) + final_sql_query = text[sql_start:sql_end].strip() + else: + final_sql_query = text.strip() + print(f"Extracted SQL from plain text chunk: {final_sql_query}") + break + except Exception as e: + print(f"Error decoding chunk: {e}") + else: + print("Chunk bytes are empty, skipping.") + # if not final_sql_query: return final_sql_query From 77cdee685a203623445ef876bf1875ce5415647f Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 03:58:55 +0530 Subject: [PATCH 103/114] fail safe fail safe --- backend/test_agent.py | 47 ++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/backend/test_agent.py b/backend/test_agent.py index 787a817..8286cd8 100644 --- a/backend/test_agent.py +++ b/backend/test_agent.py @@ -1,4 +1,5 @@ import re +import traceback import boto3 import json @@ -22,25 +23,29 @@ def is_relevance(article): ) is_relevant = False # Read the response stream - print("Response from Bedrock Agent:") - print(response) - for event in response["completion"]: - print("Event:", event) - if "chunk" in event: - print("Processing chunk...") - print("Chunk ID:", event["chunk"]) - payload = event["chunk"]["bytes"] - chunk_str = payload.decode("utf-8") - match = re.search(r"\{.*\}", chunk_str, re.DOTALL) - if match: - print("Found JSON block in chunk") - print("JSON Block:", match.group(0)) - json_block = match.group(0) - parsed_json = json.loads(json_block) - print("Parsed JSON:", parsed_json) - is_relevant = parsed_json.get("relevance_score", 0) > 0.5 - print("Is relevant:", is_relevant) - print(parsed_json.get("relevance_score", "No content found")) - else: - print("āŒ No JSON found in response") + try: + print("Response from Bedrock Agent:") + print(response) + for event in response["completion"]: + print("Event:", event) + if "chunk" in event: + print("Processing chunk...") + print("Chunk ID:", event["chunk"]) + payload = event["chunk"]["bytes"] + chunk_str = payload.decode("utf-8") + match = re.search(r"\{.*\}", chunk_str, re.DOTALL) + if match: + print("Found JSON block in chunk") + print("JSON Block:", match.group(0)) + json_block = match.group(0) + parsed_json = json.loads(json_block) + print("Parsed JSON:", parsed_json) + is_relevant = parsed_json.get("relevance_score", 0) > 0.5 + print("Is relevant:", is_relevant) + print(parsed_json.get("relevance_score", "No content found")) + else: + print("āŒ No JSON found in response") + except Exception as e: + print("Error processing response:", e) + traceback.print_exc() return is_relevant From caca4f311ba819b0fb61a8768d0244a39c2314d9 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 04:07:22 +0530 Subject: [PATCH 104/114] suppress agent error --- backend/test_agent.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/test_agent.py b/backend/test_agent.py index 8286cd8..bc6db1d 100644 --- a/backend/test_agent.py +++ b/backend/test_agent.py @@ -47,5 +47,6 @@ def is_relevance(article): print("āŒ No JSON found in response") except Exception as e: print("Error processing response:", e) - traceback.print_exc() + pass + # traceback.print_exc() return is_relevant From 49fdfb1c6c31964140fc8f6f4266ba2364f61220 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 04:17:47 +0530 Subject: [PATCH 105/114] subpress --- backend/test_agent.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/backend/test_agent.py b/backend/test_agent.py index bc6db1d..c09a98f 100644 --- a/backend/test_agent.py +++ b/backend/test_agent.py @@ -4,26 +4,28 @@ import json def is_relevance(article): + is_relevant = True + try: # Initialize Bedrock Agent Runtime client - bedrock_agent = boto3.client("bedrock-agent-runtime", region_name="us-east-1") # replace with your region - - # Agent identifiers (get these from Bedrock console) - agent_id = "IKXDLL0K7W" - agent_alias_id = "DY9KWQNAGM" + bedrock_agent = boto3.client("bedrock-agent-runtime", region_name="us-east-1") # replace with your region - # Your dynamic user message (e.g., relationship analysis prompt) - user_input = article['Content'] + # Agent identifiers (get these from Bedrock console) + agent_id = "IKXDLL0K7W" + agent_alias_id = "DY9KWQNAGM" + + # Your dynamic user message (e.g., relationship analysis prompt) + user_input = article['Content'] - # Call the agent - response = bedrock_agent.invoke_agent( - agentId=agent_id, - agentAliasId=agent_alias_id, - sessionId="news-analysis-session-001", - inputText=user_input - ) - is_relevant = False + # Call the agent + response = bedrock_agent.invoke_agent( + agentId=agent_id, + agentAliasId=agent_alias_id, + sessionId="news-analysis-session-001", + inputText=user_input + ) + # Read the response stream - try: + print("Response from Bedrock Agent:") print(response) for event in response["completion"]: @@ -46,7 +48,7 @@ def is_relevance(article): else: print("āŒ No JSON found in response") except Exception as e: - print("Error processing response:", e) - pass + print("Error processing response:", e) + pass # traceback.print_exc() return is_relevant From 500baf59505fceae0a67e7e1063eed4def266377 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 04:29:42 +0530 Subject: [PATCH 106/114] removed comments --- backend/raw_data_handler/raw_data_handler.py | 53 -------------------- 1 file changed, 53 deletions(-) diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index b9003ec..b40fae5 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -50,12 +50,6 @@ def lambda_handler(event, context): VALUES (%s, %s, %s, %s, %s)""", (article_id, article['Title'], article['Content'], article['Source'], article['Date'])) # Upload to S3 print(f"Uploading CSV to S3: {csv_filename}") - # s3.put_object( - # Bucket=BUCKET_NAME, - # Key=csv_filename, - # Body=output_csv.getvalue(), - # ContentType='text/csv' - # ) conn.commit() get_data_inline(output_csv.getvalue(), article_id, article['Date'], comprehend, cursor, conn) cursor.close() @@ -124,29 +118,7 @@ def extract_articles(file_stream): def add_keyphrase_to_article(conn, cursor, article_id, article_date, entities): entities_text = [entity['Text'] for entity in entities] print(f"Entities to be added: {entities_text}") - # cursor.execute("SELECT * FROM keyphrases WHERE phrases in %s", (tuple(entities_text),)) - # entity_db_array = cursor.fetchall() - # print(f"Entities in DB: {entity_db_array}") - # relevance_category = [] print(f"article_id: {article_id}") - # for entity in entities: - # print(f"Processing entity: {entity}") - # entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[2].lower() == entity['Text'].lower()] - # print(f"Entity in DB: {entity_in_db}") - # if not entity_in_db: - # cursor.execute("INSERT INTO keyphrases (start_datetime,phrases) VALUES (%s, %s) RETURNING id", (article_date, entity['Text'])) - # conn.commit() - # db_entity = cursor.fetchone() - # print(f"Inserted new entity: {db_entity}") - # relevance_category.append(db_entity[0]) - # else: - # print(f"Entity already exists in DB: {entity_in_db}") - # cursor.execute("""UPDATE keyphrases SET linkedtimelines = - # CASE - # WHEN linkedtimelines IS NULL OR linkedtimelines = '' THEN %s - # ELSE linkedtimelines || ', ' || %s - # END WHERE id = %s""", (article_date, article_date, entity_in_db[0][0])) - # relevance_category.append(entity_in_db[0][0]) if entities_text: relevance_category = ','.join(map(str, entities_text)) cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) @@ -154,34 +126,9 @@ def add_keyphrase_to_article(conn, cursor, article_id, article_date, entities): def add_entities_to_article(conn, cursor, article_id, entities): entities_text = [entity['Text'] for entity in entities] print(f"Entities to be added: {entities_text}") - # cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) - # entity_db_array = cursor.fetchall() - # print(f"Entities in DB: {entity_db_array}") location_mentions = [] officials_involved = [] - # relevance_category = [] print(f"article_id: {article_id}") - - # for entity in entities: - # print(f"Processing entity: {entity}") - # entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] - # print(f"Entity in DB: {entity_in_db}") - # if not entity_in_db: - # current_time = datetime.datetime.utcnow() - # cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) - # conn.commit() - # db_entity = cursor.fetchone() - # print(f"Inserted new entity: {db_entity}") - # if entity['Type'] == 'LOCATION': - # location_mentions.append(db_entity[0]) - # elif entity['Type'] == 'PERSON' or entity['Type'] == 'ORGANIZATION': - # officials_involved.append(db_entity[0]) - # else: - # print(f"Entity already exists in DB: {entity_in_db}") - # if entity['Type'] == 'LOCATION': - # location_mentions.append(entity_in_db[0][0]) - # elif entity['Type'] == 'PERSON' or entity['Type'] == 'ORGANIZATION': - # officials_involved.append(entity_in_db[0][0]) for entity in entities: if entity['Type'] == 'LOCATION': location_mentions.append(entity['Text'].lower()) From 3abbb96aa202fa942b5141f9e41ef4fb150c804a Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 05:53:45 +0530 Subject: [PATCH 107/114] cluster service --- backend/clustering_service/Utils.py | 25 ++ .../clustering_service/clustering_service.py | 398 +++++++++++++++++- 2 files changed, 417 insertions(+), 6 deletions(-) create mode 100644 backend/clustering_service/Utils.py diff --git a/backend/clustering_service/Utils.py b/backend/clustering_service/Utils.py new file mode 100644 index 0000000..41ed2b4 --- /dev/null +++ b/backend/clustering_service/Utils.py @@ -0,0 +1,25 @@ +import json +import psycopg2 +def get_postgresql_connection(): + '''get the creds from local config''' + + """ + Establish a connection to a PostgreSQL database. + + Parameters: + host (str): The hostname of the PostgreSQL server. + database (str): The name of the database to connect to. + user (str): The username to connect with. + password (str): The password for the user. + + Returns: + psycopg2.extensions.connection: A connection object to the PostgreSQL database. + """ + try: + with open("pg_config.json") as f: + config = json.load(f) + conn = psycopg2.connect(**config) + return conn + except psycopg2.Error as e: + print("Error connecting to PostgreSQL database:", e) + return None \ No newline at end of file diff --git a/backend/clustering_service/clustering_service.py b/backend/clustering_service/clustering_service.py index 76eda3b..25fb55f 100644 --- a/backend/clustering_service/clustering_service.py +++ b/backend/clustering_service/clustering_service.py @@ -1,8 +1,394 @@ +# šŸ“¦ News Articles Relation & Pattern Detection (Offline) + +# ---- 0. Install Required Packages ---- +# pip install sentence-transformers faiss-cpu bertopic hdbscan umap-learn transformers accelerate + +# ---- 1. Load and Preprocess Articles ---- +from Utils import get_postgresql_connection + + +articles = [ + {"title": "Minister Lokesh to Visit Parvathipuram for Student Felicitation and TDP Meet", "content": "Parvathipuram: Education Minister and TDP National General Secretary Nara Lokesh will visit Parvathipuram Manyam district on Monday. Party sources said that plans are being made to organize government and party programs simultaneously. On Monday morning, a felicitation program will be held at a private function hall under the auspices of the Education Department for students who have achieved the best results in class 10. 150 people will be given mementos and gifts. Later, a meeting will be held with party workers of Parvathipuram constituency. For this, MLA Vijayachandra and Ravi, a member of the tour inspection team, inspected the government junior college grounds and the private grounds in Venkampet. It has been decided to ensure that the venues of the two programs are as far apart as possible. Minister Lokesh's visit It seems that the Venkampet ground has almost been finalized following Lokesh's suggestion. Collector Shyam Prasad met with district officials and made suggestions on the arrangements for the minister's visit."}, + {"title": "Farmer Fatally Hit by Two-Wheeler While Returning Home in Palakonda", "content": "Palakonda: A small farmer who was making a living doing chores was on his way home from his farm when fate took a turn. He was hit by a two-wheeler while crossing the road. Going into the details given by SI K. Prayogamurthy... Garbhana Lalkas (62) of Gopalapuram village was returning home on Friday evening after completing his daily chores and doing some laundry. While crossing the road near the village, Yakula Raghu, who was riding a two-wheeler from Palakonda towards Rajam, hit Laxmum with his vehicle. The injured, who was seriously injured, was rushed to the Palakonda Area Hospital by the locals. Laxmum had already died. His wife Annapurna and three children wept bitterly upon learning about the incident. The SI said that a case has been registered in this incident."}, + {"title": "New Tariffs Imposed", "content": "The government imposed new tariffs on steel imports from Asia."}, + {"title": "Opposition Criticizes Trade Policy", "content": "Opposition leaders argue the trade deal will hurt local manufacturing."}, +] + + + +# šŸ“¦ Advanced News Articles Indexing & Pattern Detection +# Comprehensive solution with multiple algorithms for different use cases + +import numpy as np +from sentence_transformers import SentenceTransformer +import faiss +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity +import networkx as nx +from collections import defaultdict +import pickle import json -def lambda_handler(event, context): - # TODO implement - return { - 'statusCode': 200, - 'body': json.dumps('Hello from Lambda!') - } \ No newline at end of file +class NewsArticleIndexer: + """ + Multi-algorithm news article indexer with various similarity detection methods + """ + + def __init__(self, embedding_model='all-MiniLM-L6-v2', use_gpu=False): + # Initialize embedding model + self.embedding_model = SentenceTransformer(embedding_model) + if use_gpu: + self.embedding_model = self.embedding_model.cuda() + + # Initialize various indexing structures + self.articles = [] + self.embeddings = None + self.faiss_index = None + self.tfidf_vectorizer = None + self.tfidf_matrix = None + self.graph = None + + def add_articles(self, articles): + """Add articles to the indexer""" + self.articles.extend(articles) + self._build_indices() + + def preprocess_text(self, article): + """Enhanced preprocessing for better relevance""" + # Combine title (weighted more heavily) and content + title_weight = 2 # Give title more importance + return f"{' '.join([article['title']] * title_weight)} {article.get('location_mention', '')} {article.get('officals_involved', '')} {article.get('relevance_category', '')}" + + def _build_indices(self): + """Build all indexing structures""" + texts = [self.preprocess_text(article) for article in self.articles] + + # 1. SEMANTIC EMBEDDINGS (Best for semantic similarity) + print("Building semantic embeddings...") + self.embeddings = self.embedding_model.encode(texts, convert_to_tensor=True) + + # 2. FAISS INDEX (Best for large-scale retrieval) + print("Building FAISS index...") + self._build_faiss_index() + + # 3. TF-IDF (Best for keyword-based similarity) + print("Building TF-IDF index...") + self._build_tfidf_index(texts) + + # 4. GRAPH-BASED (Best for discovering article networks) + print("Building article graph...") + self._build_article_graph() + + def _build_faiss_index(self): + """Build FAISS index for fast similarity search""" + d = self.embeddings.shape[1] + + # For small datasets: IndexFlatL2 (exact search) + # For large datasets: IndexIVFFlat (approximate search) + if len(self.articles) < 10000: + self.faiss_index = faiss.IndexFlatL2(d) + else: + # Use IVF for larger datasets + nlist = min(100, len(self.articles) // 10) # number of clusters + quantizer = faiss.IndexFlatL2(d) + self.faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist) + # Train the index + self.faiss_index.train(self.embeddings.cpu().numpy()) + + self.faiss_index.add(self.embeddings.cpu().numpy()) + + def _build_tfidf_index(self, texts): + """Build TF-IDF index for keyword-based similarity""" + self.tfidf_vectorizer = TfidfVectorizer( + max_features=5000, + stop_words='english', + ngram_range=(1, 2), # Include bigrams + min_df=1, + max_df=0.95 + ) + self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts) + + def _build_article_graph(self, similarity_threshold=0.3): + """Build graph of related articles""" + self.graph = nx.Graph() + + # Add all articles as nodes + for i, article in enumerate(self.articles): + self.graph.add_node(i, **article) + + # Add edges based on similarity + for i in range(len(self.articles)): + similar_articles = self.find_similar_semantic(i, k=5, return_scores=True) + for j, score in similar_articles: + if i != j and score > similarity_threshold: + self.graph.add_edge(i, j, weight=score) + + # ===== SIMILARITY SEARCH METHODS ===== + + def find_similar_semantic(self, query_idx, k=5, return_scores=False): + """Find similar articles using semantic embeddings (BEST for meaning)""" + query_embedding = self.embeddings[query_idx].cpu().numpy().reshape(1, -1) + + # Search using FAISS + distances, indices = self.faiss_index.search(query_embedding, k + 1) + + # Remove the query article itself + results = [] + for i, (dist, idx) in enumerate(zip(distances[0], indices[0])): + if idx != query_idx: + similarity_score = 1 / (1 + dist) # Convert distance to similarity + if return_scores: + results.append((idx, similarity_score)) + else: + results.append(idx) + if len(results) >= k: + break + + return results + + def find_similar_tfidf(self, query_idx, k=5, return_scores=False): + """Find similar articles using TF-IDF (BEST for keywords)""" + query_vector = self.tfidf_matrix[query_idx] + similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten() + + # Get top-k similar articles (excluding self) + similar_indices = similarities.argsort()[::-1] + results = [] + + for idx in similar_indices: + if idx != query_idx and len(results) < k: + if return_scores: + results.append((idx, similarities[idx])) + else: + results.append(idx) + + return results + + def find_similar_hybrid(self, query_idx, k=5, semantic_weight=0.7, return_scores=False): + """Hybrid approach combining semantic and TF-IDF (BEST overall)""" + # Get semantic similarities + semantic_results = self.find_similar_semantic(query_idx, k=k*2, return_scores=True) + tfidf_results = self.find_similar_tfidf(query_idx, k=k*2, return_scores=True) + + # Combine scores + combined_scores = defaultdict(float) + + for idx, score in semantic_results: + combined_scores[idx] += semantic_weight * score + + for idx, score in tfidf_results: + combined_scores[idx] += (1 - semantic_weight) * score + + # Sort by combined score + sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True) + + if return_scores: + return sorted_results[:k] + else: + return [idx for idx, _ in sorted_results[:k]] + + def find_similar_graph(self, query_idx, k=5): + """Find similar articles using graph-based methods""" + if not self.graph.has_node(query_idx): + return [] + + # Get neighbors sorted by edge weight + neighbors = list(self.graph.neighbors(query_idx)) + neighbor_weights = [(n, self.graph[query_idx][n]['weight']) for n in neighbors] + neighbor_weights.sort(key=lambda x: x[1], reverse=True) + + return [idx for idx, _ in neighbor_weights[:k]] + + # ===== ADVANCED ANALYSIS METHODS ===== + + def detect_article_clusters(self, method='semantic', n_clusters=None): + """Detect clusters of related articles""" + from sklearn.cluster import KMeans, DBSCAN + + if method == 'semantic': + features = self.embeddings.cpu().numpy() + elif method == 'tfidf': + features = self.tfidf_matrix.toarray() + + if n_clusters: + clusterer = KMeans(n_clusters=n_clusters, random_state=42) + else: + clusterer = DBSCAN(eps=0.5, min_samples=2) + + cluster_labels = clusterer.fit_predict(features) + + # Group articles by cluster + clusters = defaultdict(list) + for i, label in enumerate(cluster_labels): + clusters[label].append(i) + + return dict(clusters) + + def find_trending_topics(self, time_window_hours=24): + """Find trending topics (if articles have timestamps)""" + # This would require timestamp information in articles + # Implementation would filter recent articles and cluster them + pass + + def get_article_importance_scores(self): + """Calculate importance scores using PageRank on article graph""" + if self.graph is None: + return {} + + pagerank_scores = nx.pagerank(self.graph, weight='weight') + return pagerank_scores + + def save_index(self, filepath): + """Save the entire index to disk""" + index_data = { + 'articles': self.articles, + 'embeddings': self.embeddings.cpu().numpy() if self.embeddings is not None else None, + 'tfidf_vectorizer': self.tfidf_vectorizer, + 'tfidf_matrix': self.tfidf_matrix, + 'graph': self.graph + } + + with open(filepath, 'wb') as f: + pickle.dump(index_data, f) + + # Save FAISS index separately + if self.faiss_index is not None: + faiss.write_index(self.faiss_index, f"{filepath}.faiss") + + def load_index(self, filepath): + """Load index from disk""" + with open(filepath, 'rb') as f: + index_data = pickle.load(f) + + self.articles = index_data['articles'] + self.embeddings = index_data['embeddings'] + self.tfidf_vectorizer = index_data['tfidf_vectorizer'] + self.tfidf_matrix = index_data['tfidf_matrix'] + self.graph = index_data['graph'] + + # Load FAISS index + try: + self.faiss_index = faiss.read_index(f"{filepath}.faiss") + except: + print("Could not load FAISS index, rebuilding...") + self._build_faiss_index() + +# ===== USAGE EXAMPLE ===== + +def main(): + # Sample articles + conn = get_postgresql_connection() + cursor = conn.cursor() + query = "SELECT * FROM articles order by article_id asc" + cursor.execute(query) + articles_db = cursor.fetchall() + articles = [{} for _ in range(len(articles_db))] + for i, article in enumerate(articles_db): + articles[i] = { + "article_id": article[0], + "title": article[1], + # "content": article[2], + "location_mention": article[5], + "officals_involved": article[6], + "relevance_category": article[7], + } + + # Initialize indexer + indexer = NewsArticleIndexer() + indexer.add_articles(articles) + + # Test different similarity methods + # "President Signs New Trade Deal" + for i, article in enumerate(articles): + query_idx = i + print(f"\nšŸ” i: '{i}'") + print(f"\nšŸ” Finding articles similar to: '{articles[query_idx]['title']}'") + linked_id = []; + print("\n1. 🧠 Semantic Similarity (Best for meaning):") + semantic_results = indexer.find_similar_semantic(query_idx, k=3) + for idx in semantic_results: + linked_id.append(articles[idx]['article_id']) + print(f" - {articles[idx]['title']}") + + # print("\n2. šŸ“ TF-IDF Similarity (Best for keywords):") + # tfidf_results = indexer.find_similar_tfidf(query_idx, k=3) + # for idx in tfidf_results: + # print(f" - {articles[idx]['title']}") + + print("\n3. Hybrid Similarity (Best overall):") + hybrid_results = indexer.find_similar_hybrid(query_idx, k=3) + for idx in hybrid_results: + print(f" - {articles[idx]['title']}") + cursor.execute("UPDATE articles SET linked_id = %s WHERE article_id = %s", (linked_id, articles[query_idx]['article_id'])) + conn.commit() + # print("\n4. Graph-based Similarity:") + # graph_results = indexer.find_similar_graph(query_idx, k=3) + # for idx in graph_results: + # print(f" - {articles[idx]['title']}") + + # # Detect clusters + # print("\n Article Clusters:") + # clusters = indexer.detect_article_clusters(method='semantic') + # for cluster_id, article_indices in clusters.items(): + # if cluster_id != -1: # Ignore noise cluster + # print(f"Cluster {cluster_id}:") + # for idx in article_indices: + # print(f" - {articles[idx]['title']}") + + # Calculate importance scores + # print("\n Article Importance Scores:") + # importance_scores = indexer.get_article_importance_scores() + # sorted_importance = sorted(importance_scores.items(), key=lambda x: x[1], reverse=True) + # for idx, score in sorted_importance[:3]: + # print(f" {score:.3f} - {articles[idx]['title']}") + +if __name__ == "__main__": + main() + +# ===== ALGORITHM RECOMMENDATIONS ===== + +""" +šŸŽÆ BEST ALGORITHMS FOR DIFFERENT USE CASES: + +1. **SMALL DATASETS (<10K articles)**: + - Index: FAISS IndexFlatL2 (exact search) + - Similarity: Hybrid (semantic + TF-IDF) + - Best for: High accuracy, all-purpose + +2. **LARGE DATASETS (>10K articles)**: + - Index: FAISS IndexIVFFlat or IndexHNSW + - Similarity: Semantic embeddings with FAISS + - Best for: Speed and scalability + +3. **KEYWORD-FOCUSED SEARCH**: + - Index: Elasticsearch/Solr or TF-IDF with sparse matrices + - Similarity: BM25 or TF-IDF cosine similarity + - Best for: Exact keyword matching + +4. **SEMANTIC UNDERSTANDING**: + - Index: Dense vector index (FAISS) + - Embeddings: sentence-transformers or OpenAI embeddings + - Best for: Understanding meaning and context + +5. **REAL-TIME UPDATES**: + - Index: FAISS with IndexIVFFlat + - Update strategy: Incremental indexing + - Best for: News feeds, live updates + +6. **TOPIC DISCOVERY**: + - Algorithm: BERTopic or LDA + - Features: Document embeddings or TF-IDF + - Best for: Finding emerging topics + +7. **ARTICLE NETWORKS**: + - Algorithm: Graph-based (NetworkX + PageRank) + - Features: Similarity-based edges + - Best for: Finding influential articles + +šŸš€ PERFORMANCE OPTIMIZATIONS: +- Use GPU acceleration for embeddings +- Implement caching for frequent queries +- Use approximate nearest neighbor search for large datasets +- Precompute similarity matrices for small datasets +- Implement incremental indexing for real-time updates +""" \ No newline at end of file From 53979e3ecbfec3d4512cf6ae4caccd6410038dc4 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 06:02:07 +0530 Subject: [PATCH 108/114] Create feed.py Create feed.py --- backend/get_feed/get_feed.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 backend/get_feed/get_feed.py diff --git a/backend/get_feed/get_feed.py b/backend/get_feed/get_feed.py new file mode 100644 index 0000000..75f6cdc --- /dev/null +++ b/backend/get_feed/get_feed.py @@ -0,0 +1,20 @@ + +from Utils import get_postgresql_connection + + +def lambda_handler(event, context): + conn = get_postgresql_connection() + cursor = conn.cursor() + query = "SELECT * FROM articles order by article_id asc" + cursor.execute(query) + articles = cursor.fetchall() + return { + "statusCode": 200, + "headers": { + "Content-Type": "application/json" + }, + "body": json.dumps({ + "status": "success", + "data": articles + }) + } \ No newline at end of file From 113c72e9faf0d3feedee044a2bd5ccb11d33bab7 Mon Sep 17 00:00:00 2001 From: hackathon Date: Sun, 29 Jun 2025 06:05:30 +0530 Subject: [PATCH 109/114] adding api --- api/get_feed.json | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 api/get_feed.json diff --git a/api/get_feed.json b/api/get_feed.json new file mode 100644 index 0000000..52f75ce --- /dev/null +++ b/api/get_feed.json @@ -0,0 +1,10 @@ +{ + "apiName": "hackathon", + "resourcePath": "getfeed", + "method": { + "httpMethod": "GET", + "authorizationType": "NONE", + "lambdaFunctionName": "get_feed" + }, + "deploy": true +} \ No newline at end of file From 7bd0203cb5194b3fe835261efed85bdfc9732473 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 06:11:17 +0530 Subject: [PATCH 110/114] api fix --- backend/Submit/Data_parser.py | 42 ---- backend/Submit/Submit copy.py | 122 ----------- .../clustering_service/clustering_service.py | 56 +---- backend/get_feed/get_feed.py | 1 + .../raw_data_handler/raw_data_handler copy.py | 198 ------------------ backend/raw_data_handler/raw_data_handler.py | 6 + 6 files changed, 11 insertions(+), 414 deletions(-) delete mode 100644 backend/Submit/Data_parser.py delete mode 100644 backend/Submit/Submit copy.py delete mode 100644 backend/raw_data_handler/raw_data_handler copy.py diff --git a/backend/Submit/Data_parser.py b/backend/Submit/Data_parser.py deleted file mode 100644 index 0fa971f..0000000 --- a/backend/Submit/Data_parser.py +++ /dev/null @@ -1,42 +0,0 @@ -import re -import csv -from docx import Document - -# Step 1: Read DOCX text -def read_docx_text(file_path): - doc = Document(file_path) - return "\n".join(p.text for p in doc.paragraphs) - -# Step 2: Extract articles using regex -def extract_articles(text): - pattern = re.compile( - r'Title:\s*(.*?)\s*Source:\s*(.*?)\s*Date:\s*(.*?)\s*(?=(?:\d{1,2}\)|Title:)|\Z)', - re.DOTALL - ) - matches = pattern.findall(text) - articles = [] - for match in matches: - title = match[0].strip() - source = match[1].strip() - date_parts = match[2].strip().split("\n", 1) - date = date_parts[0].strip() - content = date_parts[1].strip() if len(date_parts) > 1 else "" - articles.append([title, source, date, content]) - return articles - -# Step 3: Save to CSV -def save_to_csv(articles, csv_path): - with open(csv_path, "w", newline="", encoding="utf-8") as f: - writer = csv.writer(f) - writer.writerow(["Title", "Source", "Date", "Body"]) - writer.writerows(articles) - -# Usage -docx_file = "D:\\Hackathon\\aihack-uc8\\backend\\MixedArticles _SameArticle_DiffSources.docx" # your input DOCX file -csv_file = "articles_parsed.csv" # output CSV file - -text = read_docx_text(docx_file) -articles = extract_articles(text) -save_to_csv(articles, csv_file) - -print(f"āœ… Extracted {len(articles)} articles to {csv_file}") diff --git a/backend/Submit/Submit copy.py b/backend/Submit/Submit copy.py deleted file mode 100644 index 95bf690..0000000 --- a/backend/Submit/Submit copy.py +++ /dev/null @@ -1,122 +0,0 @@ -import base64 -import json -from requests_toolbelt.multipart import decoder -import uuid -from Utils import get_postgresql_connection -from fastapi import FastAPI -from docx import Document -import csv -import io -import re -import boto3 -import traceback - -app = FastAPI() - -s3 = boto3.client('s3') -BUCKET_NAME = 'awstraindata' - -def lambda_handler(event, context): - try: - # Decode base64-encoded body (API Gateway encodes binary automatically) - print(f"Received event") - if event.get("isBase64Encoded", False): - body = base64.b64decode(event['body']) - else: - body = event['body'].encode("utf-8") - print(f"Decoded body length: {len(body)} bytes") - # Get content-type header - content_type = event['headers'].get('Content-Type') or event['headers'].get('content-type') - if not content_type: - return {"statusCode": 400, "body": "Missing Content-Type header"} - - # Parse multipart form - multipart_data = decoder.MultipartDecoder(body, content_type) - print(f"Multipart data parts: {len(multipart_data.parts)}") - s3_urls = [] - conn = get_postgresql_connection() - cursor = conn.cursor() - for part in multipart_data.parts: - print(f"Processing part: {part.headers.get(b'Content-Disposition')}") - filename = part.headers.get(b'Content-Disposition').decode().split('filename="')[1].split('"')[0] - file_stream = io.BytesIO(part.content) - file_stream.seek(0) - file_id = str(uuid.uuid4()) - s3_key = f"raw_data/{file_id}-{filename}" - # Upload to S3 - s3.put_object( - Bucket=BUCKET_NAME, - Key=s3_key, - Body=file_stream, - ContentType='application/vnd.openxmlformats-officedocument.wordprocessingml.document' - ) - # Extract file name from content-disposition - articles = extract_articles(io.BytesIO(part.content)) - print(f"Extracted {len(articles)} articles from part") - for article in articles: - print(f"Processing article: {article['Title']}") - output_csv = io.StringIO() - writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) - writer.writeheader() - writer.writerow(article) - article_id = str(uuid.uuid4()) - # Generate unique filename - csv_filename = f"input/articles-{article_id}.csv" - cursor.execute(""" - INSERT INTO articles (article_id, title, body, source, published_date) - VALUES (%s, %s, %s, %s, %s)""", (article_id, article['Title'], article['Content'], article['Source'], article['Date'])) - # Upload to S3 - print(f"Uploading CSV to S3: {csv_filename}") - s3.put_object( - Bucket=BUCKET_NAME, - Key=csv_filename, - Body=output_csv.getvalue(), - ContentType='text/csv' - ) - conn.commit() - print(f"Uploaded CSV to S3: {csv_filename}") - s3_url = f"https://{BUCKET_NAME}.s3.amazonaws.com/{csv_filename}" - s3_urls.append(s3_url) - cursor.close() - conn.close() - return { - "statusCode": 200, - "headers": { - "Content-Type": "application/json" - }, - "body": json.dumps({ - "status": "success", - "s3_urls": s3_urls - }) - } - except Exception as e: - traceback.print_exc() - return {"statusCode": 500, "body": f"āŒ Error: {str(e)}"} - -def extract_articles(file_stream): - print(f"Extracting articles from file stream") - doc = Document(file_stream) - print(f"Document loaded with {len(doc.paragraphs)} paragraphs") - text = "\n".join(p.text for p in doc.paragraphs) - pattern = re.compile( - r'Title:\s*(.*?)\s*Source:\s*(.*?)\s*Date:\s*(.*?)\s*(?=(?:\d{1,2}\)|Title:)|\Z)', - re.DOTALL - ) - matches = pattern.findall(text) - print(f"Found {len(matches)} matches in the document") - articles = [] - for match in matches: - print(f"Processing match: {match}") - title = match[0].strip() - source = match[1].strip() - date_parts = match[2].strip().split("\n", 1) - date = date_parts[0].strip() - content = date_parts[1].strip() if len(date_parts) > 1 else "" - print(f"Extracted article - Title: {title}, Source: {source}, Date: {date}, Content length: {len(content)}") - articles.append({ - "Title": title, - "Source": source, - "Date": date, - "Content": content - }) - return articles \ No newline at end of file diff --git a/backend/clustering_service/clustering_service.py b/backend/clustering_service/clustering_service.py index 25fb55f..1b94175 100644 --- a/backend/clustering_service/clustering_service.py +++ b/backend/clustering_service/clustering_service.py @@ -303,14 +303,14 @@ def main(): query_idx = i print(f"\nšŸ” i: '{i}'") print(f"\nšŸ” Finding articles similar to: '{articles[query_idx]['title']}'") - linked_id = []; - print("\n1. 🧠 Semantic Similarity (Best for meaning):") + linked_id = [] + print("\n1. Semantic Similarity (Best for meaning):") semantic_results = indexer.find_similar_semantic(query_idx, k=3) for idx in semantic_results: linked_id.append(articles[idx]['article_id']) print(f" - {articles[idx]['title']}") - # print("\n2. šŸ“ TF-IDF Similarity (Best for keywords):") + # print("\n2. TF-IDF Similarity (Best for keywords):") # tfidf_results = indexer.find_similar_tfidf(query_idx, k=3) # for idx in tfidf_results: # print(f" - {articles[idx]['title']}") @@ -343,52 +343,4 @@ def main(): # print(f" {score:.3f} - {articles[idx]['title']}") if __name__ == "__main__": - main() - -# ===== ALGORITHM RECOMMENDATIONS ===== - -""" -šŸŽÆ BEST ALGORITHMS FOR DIFFERENT USE CASES: - -1. **SMALL DATASETS (<10K articles)**: - - Index: FAISS IndexFlatL2 (exact search) - - Similarity: Hybrid (semantic + TF-IDF) - - Best for: High accuracy, all-purpose - -2. **LARGE DATASETS (>10K articles)**: - - Index: FAISS IndexIVFFlat or IndexHNSW - - Similarity: Semantic embeddings with FAISS - - Best for: Speed and scalability - -3. **KEYWORD-FOCUSED SEARCH**: - - Index: Elasticsearch/Solr or TF-IDF with sparse matrices - - Similarity: BM25 or TF-IDF cosine similarity - - Best for: Exact keyword matching - -4. **SEMANTIC UNDERSTANDING**: - - Index: Dense vector index (FAISS) - - Embeddings: sentence-transformers or OpenAI embeddings - - Best for: Understanding meaning and context - -5. **REAL-TIME UPDATES**: - - Index: FAISS with IndexIVFFlat - - Update strategy: Incremental indexing - - Best for: News feeds, live updates - -6. **TOPIC DISCOVERY**: - - Algorithm: BERTopic or LDA - - Features: Document embeddings or TF-IDF - - Best for: Finding emerging topics - -7. **ARTICLE NETWORKS**: - - Algorithm: Graph-based (NetworkX + PageRank) - - Features: Similarity-based edges - - Best for: Finding influential articles - -šŸš€ PERFORMANCE OPTIMIZATIONS: -- Use GPU acceleration for embeddings -- Implement caching for frequent queries -- Use approximate nearest neighbor search for large datasets -- Precompute similarity matrices for small datasets -- Implement incremental indexing for real-time updates -""" \ No newline at end of file + main() \ No newline at end of file diff --git a/backend/get_feed/get_feed.py b/backend/get_feed/get_feed.py index 75f6cdc..cb4208d 100644 --- a/backend/get_feed/get_feed.py +++ b/backend/get_feed/get_feed.py @@ -1,4 +1,5 @@ +import json from Utils import get_postgresql_connection diff --git a/backend/raw_data_handler/raw_data_handler copy.py b/backend/raw_data_handler/raw_data_handler copy.py deleted file mode 100644 index e235d6f..0000000 --- a/backend/raw_data_handler/raw_data_handler copy.py +++ /dev/null @@ -1,198 +0,0 @@ -import base64 -import json -import time -import uuid -from Utils import get_postgresql_connection -from fastapi import FastAPI -from docx import Document -import csv -import io -import re -import boto3 -import traceback - -BUCKET_NAME = 'awstraindata' -role = 'arn:aws:iam::269854564686:role/hackathon-comprehend-role' -def lambda_handler(event, context): - try: - conn = get_postgresql_connection() - cursor = conn.cursor() - comprehend = boto3.client('comprehend', region_name='us-east-1') - for record in event['Records']: - print(f"New record: {record}") - bucket = record['s3']['bucket']['name'] - key = record['s3']['object']['key'] - print(f"Processing file from bucket: {bucket}, key: {key}") - s3 = boto3.client('s3') - print(f"Connecting to S3 bucket: {bucket}") - obj = s3.get_object(Bucket=bucket, Key=key) - stream = io.BytesIO(obj['Body'].read()) - articles = extract_articles(stream) - print(f"Extracted {len(articles)} articles from part") - for article in articles: - print(f"Processing article: {article['Title']}") - output_csv = io.StringIO() - writer = csv.DictWriter(output_csv, fieldnames=["Title", "Source", "Date", "Content"]) - # writer.writeheader() - writer.writerow(article) - article_id = str(uuid.uuid4()) - # Generate unique filename - csv_filename = f"input/articles-{article_id}.csv" - cursor.execute(""" - INSERT INTO articles (article_id, title, body, source, published_date) - VALUES (%s, %s, %s, %s, %s)""", (article_id, article['Title'], article['Content'], article['Source'], article['Date'])) - # Upload to S3 - print(f"Uploading CSV to S3: {csv_filename}") - s3.put_object( - Bucket=BUCKET_NAME, - Key=csv_filename, - Body=output_csv.getvalue(), - ContentType='text/csv' - ) - conn.commit() - start_jobs(f's3://{BUCKET_NAME}/{csv_filename}', article_id, comprehend, role, cursor, conn) - cursor.close() - conn.close() - except Exception as e: - traceback.print_exc() - print(f"Error processing event: {e}") -def get_data_inline(data, articles_id, comprehend, role_arn, cursor, conn): - entities_response = comprehend.detect_entities( - Text=data['Content'], - DataAccessRoleArn=role_arn, - LanguageCode='en' - ) - add_entities_to_article(conn, cursor, articles_id, entities_response['Entities']) - for entity in entities_response['Entities']: - print(f"Found entity: {entity['Text']} (Type: {entity['Type']})") - -def start_jobs(s3_uri, articles_id, comprehend, role_arn, cursor, conn): - print(f"Starting jobs for article ID: {articles_id}") - entities_job = comprehend.start_entities_detection_job( - InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, - OutputDataConfig={'S3Uri': 's3://awstraindata/output/entities/'}, - DataAccessRoleArn=role_arn, - LanguageCode='en', - JobName='MyEntityDetectionJob_'+ articles_id + '_' + str(int(time.time())) - ) - print(f"Entities job started: {entities_job['JobId']}") - result = comprehend.describe_entities_detection_job(JobId=entities_job['JobId']) - print(f"Entities job description: {result}") - entities_output = result['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri'] - - # SENTIMENT detection job - sentiment_job = comprehend.start_sentiment_detection_job( - InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, - OutputDataConfig={'S3Uri': 's3://awstraindata/output/sentiment/'}, - DataAccessRoleArn=role_arn, - LanguageCode='en', - JobName='MySentimentDetectionJob_' + articles_id + '_' + str(int(time.time())) - ) - print(f"Sentiment job started: {sentiment_job['JobId']}") - res = comprehend.describe_sentiment_detection_job(JobId=sentiment_job['JobId']) - print(f"Sentiment job description: {res}") - sentiment_output = res['SentimentDetectionJobProperties']['OutputDataConfig']['S3Uri'] - - # KEY PHRASES detection job - phrases_job = comprehend.start_key_phrases_detection_job( - InputDataConfig={'S3Uri': s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, - OutputDataConfig={'S3Uri': 's3://awstraindata/output/keyphrases/'}, - DataAccessRoleArn=role_arn, - LanguageCode='en', - JobName='MyKeyPhrasesDetectionJob_' + articles_id + '_' + str(int(time.time())) - ) - print(f"Key Phrases job started: {phrases_job['JobId']}") - res = comprehend.describe_key_phrases_detection_job(JobId=phrases_job['JobId']) - print(f"Key Phrases job description: {res}") - key_phrases_output = res['KeyPhrasesDetectionJobProperties']['OutputDataConfig']['S3Uri'] - print("Entities Job Response:", entities_output) - print("Sentiment Job Response:", sentiment_output) - print("Key Phrases Job Response:", key_phrases_output) - print("Inserting into comprehend_jobs table") - cursor.execute(""" - INSERT INTO comprehend_jobs (article_id, input_s3_uri, entities_path, sentiment_path, key_phrases_path) - VALUES (%s, %s, %s, %s, %s)""", (articles_id, s3_uri, entities_output.replace('s3://awstraindata/', ''), sentiment_output.replace('s3://awstraindata/', ''), key_phrases_output.replace('s3://awstraindata/', ''))) - conn.commit() - -def extract_articles(file_stream): - print(f"Extracting articles from file stream") - doc = Document(file_stream) - print(f"Document loaded with {len(doc.paragraphs)} paragraphs") - text = "\n".join(p.text for p in doc.paragraphs) - pattern = re.compile( - r'Title:\s*(.*?)\s*Source:\s*(.*?)\s*Date:\s*(.*?)\s*(?=(?:\d{1,2}\)|Title:)|\Z)', - re.DOTALL - ) - matches = pattern.findall(text) - print(f"Found {len(matches)} matches in the document") - articles = [] - for match in matches: - print(f"Processing match: {match}") - title = match[0].strip() - source = match[1].strip() - date_parts = match[2].strip().split("\n", 1) - date = date_parts[0].strip() - content = date_parts[1].strip() if len(date_parts) > 1 else "" - print(f"Extracted article - Title: {title}, Source: {source}, Date: {date}, Content length: {len(content)}") - articles.append({ - "Title": title, - "Source": source, - "Date": date, - "Content": content - }) - return articles - - -def add_entities_to_article(conn, cursor, article_id, entities): - entities_text = [entity['Text'] for entity in entities] - print(f"Entities to be added: {entities_text}") - cursor.execute("SELECT * FROM entities WHERE entity in %s", (tuple(entities_text),)) - entity_db_array = cursor.fetchall() - print(f"Entities in DB: {entity_db_array}") - location_mentions = [] - officials_involved = [] - relevance_category = [] - print(f"article_id: {article_id}") - - print(f"Relevance category: {relevance_category}") - for entity in entities: - print(f"Processing entity: {entity}") - entity_in_db = [db_entity for db_entity in entity_db_array if db_entity[3].lower() == entity['Text'].lower()] - print(f"Entity in DB: {entity_in_db}") - if not entity_in_db: - current_time = datetime.datetime.utcnow() - cursor.execute("INSERT INTO entities (create_time,entity,type) VALUES (%s, %s, %s) RETURNING id", (current_time, entity['Text'], entity['Type'])) - conn.commit() - db_entity = cursor.fetchone() - print(f"Inserted new entity: {db_entity}") - if entity['Type'] == 'LOCATION': - location_mentions.append(db_entity[0]) - elif entity['Type'] == 'PERSON': - officials_involved.append(db_entity[0]) - else: - relevance_category.append(db_entity[0]) - else: - print(f"Entity already exists in DB: {entity_in_db}") - if entity['Type'] == 'LOCATION': - location_mentions.append(entity_in_db[0][0]) - elif entity['Type'] == 'PERSON': - officials_involved.append(entity_in_db[0][0]) - else: - relevance_category.append(entity_in_db[0][0]) - if location_mentions: - location_mentions = ','.join(map(str, location_mentions)) - cursor.execute("""update articles set location_mentions = %s where article_id = %s""", (location_mentions, article_id)) - - if officials_involved: - officials_involved = ','.join(map(str, officials_involved)) - cursor.execute("""update articles set officials_involved = %s where article_id = %s""", (officials_involved, article_id)) - - if relevance_category: - cursor.execute("SELECT relevance_category FROM articles WHERE article_id = %s", (article_id,)) - existing = cursor.fetchone() - relevance_category = ','.join(map(str, relevance_category)) - if existing[0] is not None: - print(f"Existing relevance category: {existing[0]}") - relevance_category = relevance_category + ',' + existing[0] - cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) - cursor.execute("""update articles set relevance_category = %s where article_id = %s""", (relevance_category, article_id)) \ No newline at end of file diff --git a/backend/raw_data_handler/raw_data_handler.py b/backend/raw_data_handler/raw_data_handler.py index b40fae5..9c13cb3 100644 --- a/backend/raw_data_handler/raw_data_handler.py +++ b/backend/raw_data_handler/raw_data_handler.py @@ -54,6 +54,12 @@ def lambda_handler(event, context): get_data_inline(output_csv.getvalue(), article_id, article['Date'], comprehend, cursor, conn) cursor.close() conn.close() + lambda_client = boto3.client('lambda') + response = lambda_client.invoke( + FunctionName='clustering_service', + InvocationType='Event' + ) + print(f"Second Lambda function invoked: {response}") except Exception as e: traceback.print_exc() print(f"Error processing event: {e}") From 92c6a9b112bee0119127f485b66ddd62f5c58163 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 06:15:07 +0530 Subject: [PATCH 111/114] removed unused code --- .../clustering_service/clustering_service.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/backend/clustering_service/clustering_service.py b/backend/clustering_service/clustering_service.py index 1b94175..e9ac3de 100644 --- a/backend/clustering_service/clustering_service.py +++ b/backend/clustering_service/clustering_service.py @@ -1,24 +1,5 @@ -# šŸ“¦ News Articles Relation & Pattern Detection (Offline) - -# ---- 0. Install Required Packages ---- -# pip install sentence-transformers faiss-cpu bertopic hdbscan umap-learn transformers accelerate - -# ---- 1. Load and Preprocess Articles ---- from Utils import get_postgresql_connection - -articles = [ - {"title": "Minister Lokesh to Visit Parvathipuram for Student Felicitation and TDP Meet", "content": "Parvathipuram: Education Minister and TDP National General Secretary Nara Lokesh will visit Parvathipuram Manyam district on Monday. Party sources said that plans are being made to organize government and party programs simultaneously. On Monday morning, a felicitation program will be held at a private function hall under the auspices of the Education Department for students who have achieved the best results in class 10. 150 people will be given mementos and gifts. Later, a meeting will be held with party workers of Parvathipuram constituency. For this, MLA Vijayachandra and Ravi, a member of the tour inspection team, inspected the government junior college grounds and the private grounds in Venkampet. It has been decided to ensure that the venues of the two programs are as far apart as possible. Minister Lokesh's visit It seems that the Venkampet ground has almost been finalized following Lokesh's suggestion. Collector Shyam Prasad met with district officials and made suggestions on the arrangements for the minister's visit."}, - {"title": "Farmer Fatally Hit by Two-Wheeler While Returning Home in Palakonda", "content": "Palakonda: A small farmer who was making a living doing chores was on his way home from his farm when fate took a turn. He was hit by a two-wheeler while crossing the road. Going into the details given by SI K. Prayogamurthy... Garbhana Lalkas (62) of Gopalapuram village was returning home on Friday evening after completing his daily chores and doing some laundry. While crossing the road near the village, Yakula Raghu, who was riding a two-wheeler from Palakonda towards Rajam, hit Laxmum with his vehicle. The injured, who was seriously injured, was rushed to the Palakonda Area Hospital by the locals. Laxmum had already died. His wife Annapurna and three children wept bitterly upon learning about the incident. The SI said that a case has been registered in this incident."}, - {"title": "New Tariffs Imposed", "content": "The government imposed new tariffs on steel imports from Asia."}, - {"title": "Opposition Criticizes Trade Policy", "content": "Opposition leaders argue the trade deal will hurt local manufacturing."}, -] - - - -# šŸ“¦ Advanced News Articles Indexing & Pattern Detection -# Comprehensive solution with multiple algorithms for different use cases - import numpy as np from sentence_transformers import SentenceTransformer import faiss From 8eeae7960348e23c5d6b2c43283d7db6e5e7ac04 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 06:17:52 +0530 Subject: [PATCH 112/114] update response --- backend/Submit/Submit.py | 2 +- backend/get_feed/get_feed.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/backend/Submit/Submit.py b/backend/Submit/Submit.py index 74a5ca1..31d88a0 100644 --- a/backend/Submit/Submit.py +++ b/backend/Submit/Submit.py @@ -59,4 +59,4 @@ def lambda_handler(event, context): } except Exception as e: traceback.print_exc() - return {"statusCode": 500, "body": f"āŒ Error: {str(e)}"} \ No newline at end of file + return {"statusCode": 500, "body": f"Error: {str(e)}"} \ No newline at end of file diff --git a/backend/get_feed/get_feed.py b/backend/get_feed/get_feed.py index cb4208d..b6df35c 100644 --- a/backend/get_feed/get_feed.py +++ b/backend/get_feed/get_feed.py @@ -14,8 +14,5 @@ def lambda_handler(event, context): "headers": { "Content-Type": "application/json" }, - "body": json.dumps({ - "status": "success", - "data": articles - }) + "body": json.dumps(articles) } \ No newline at end of file From 603013191fbfa2906d1aa935f4b6fe26e8faa165 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 06:19:45 +0530 Subject: [PATCH 113/114] get feed response changes --- backend/get_feed/get_feed.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/get_feed/get_feed.py b/backend/get_feed/get_feed.py index b6df35c..7616578 100644 --- a/backend/get_feed/get_feed.py +++ b/backend/get_feed/get_feed.py @@ -8,11 +8,13 @@ def lambda_handler(event, context): cursor = conn.cursor() query = "SELECT * FROM articles order by article_id asc" cursor.execute(query) - articles = cursor.fetchall() + columns = [desc[0] for desc in cursor.description] + rows = cursor.fetchall() + result = [dict(zip(columns, row)) for row in rows] return { "statusCode": 200, "headers": { "Content-Type": "application/json" }, - "body": json.dumps(articles) + "body": json.dumps(result) } \ No newline at end of file From 30cacb92fe986f0bac8cd43dc8814dd72ff09a13 Mon Sep 17 00:00:00 2001 From: Shiva Teja Konduri Date: Sun, 29 Jun 2025 06:24:34 +0530 Subject: [PATCH 114/114] Removed unused files --- backend/Backup/Submit.py | 92 ------------------- backend/Backup/Transform.py | 64 ------------- backend/cluster-prioritization/Utils.py | 25 ----- .../cluster-prioritization.py | 44 --------- backend/clustering_service/Utils.py | 25 ----- backend/input_handler/Utils.py | 25 ----- backend/output_handler/Utils.py | 25 ----- 7 files changed, 300 deletions(-) delete mode 100644 backend/Backup/Submit.py delete mode 100644 backend/Backup/Transform.py delete mode 100644 backend/cluster-prioritization/Utils.py delete mode 100644 backend/cluster-prioritization/cluster-prioritization.py delete mode 100644 backend/clustering_service/Utils.py delete mode 100644 backend/input_handler/Utils.py delete mode 100644 backend/output_handler/Utils.py diff --git a/backend/Backup/Submit.py b/backend/Backup/Submit.py deleted file mode 100644 index 2620638..0000000 --- a/backend/Backup/Submit.py +++ /dev/null @@ -1,92 +0,0 @@ -import io -import pandas as pd -import boto3 -import time -import uuid -from Utils import get_postgresql_connection - -comprehend = boto3.client('comprehend') - -input_s3_uri = 's3://awstraindata/input.csv' -role_arn = 'arn:aws:iam::269854564686:role/hackathon-comprehend-role' - -s3 = boto3.client('s3') -# Download the file object -input_csv_object = s3.get_object(Bucket='awstraindata', Key='input.csv') - -# Read CSV into DataFrame -conn = get_postgresql_connection() -cursor = conn.cursor() -cursor.execute("drop table if exists articles") -cursor.execute("""CREATE TABLE IF NOT EXISTS articles ( - articles_id TEXT, - title TEXT, - body TEXT, - source TEXT, - published_date TEXT, - location_mentions TEXT, - officials_involved TEXT, - relevance_category TEXT, - sentiment TEXT - )""") -input_csv = pd.read_csv(io.BytesIO(input_csv_object['Body'].read())) -for index, row in input_csv.iterrows(): - print(f"Processing row {index}: {row}") - cursor.execute(""" - INSERT INTO articles (articles_id, title, body, source, published_date) - VALUES (%s, %s, %s, %s, %s)""", (row[0], row[1], row[2], row[3], row[4])) -conn.commit() -cursor.close() -entities_job = comprehend.start_entities_detection_job( - InputDataConfig={'S3Uri': input_s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, - OutputDataConfig={'S3Uri': 's3://awstraindata/output/entities/'}, - DataAccessRoleArn=role_arn, - LanguageCode='en', - JobName='MyEntityDetectionJob_' + str(int(time.time())), -) -result = comprehend.describe_entities_detection_job(JobId=entities_job['JobId']) -entities_output = result['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri'] - -# SENTIMENT detection job -sentiment_job = comprehend.start_sentiment_detection_job( - InputDataConfig={'S3Uri': input_s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, - OutputDataConfig={'S3Uri': 's3://awstraindata/output/sentiment/'}, - DataAccessRoleArn=role_arn, - LanguageCode='en', - JobName='MySentimentDetectionJob_' + str(int(time.time())), -) -res = comprehend.describe_sentiment_detection_job(JobId=sentiment_job['JobId']) -sentiment_output = res['SentimentDetectionJobProperties']['OutputDataConfig']['S3Uri'] - -# KEY PHRASES detection job -phrases_job = comprehend.start_key_phrases_detection_job( - InputDataConfig={'S3Uri': input_s3_uri, 'InputFormat': 'ONE_DOC_PER_LINE'}, - OutputDataConfig={'S3Uri': 's3://awstraindata/output/keyphrases/'}, - DataAccessRoleArn=role_arn, - LanguageCode='en', - JobName='MyKeyPhrasesDetectionJob_' + str(int(time.time())), -) -res = comprehend.describe_key_phrases_detection_job(JobId=phrases_job['JobId']) -key_phrases_output = res['KeyPhrasesDetectionJobProperties']['OutputDataConfig']['S3Uri'] -print("Entities Job Response:", entities_job) -print("Sentiment Job Response:", sentiment_job) -print("Key Phrases Job Response:", phrases_job) -conn = get_postgresql_connection() -if conn: - cursor = conn.cursor() - cursor.execute(""" - CREATE TABLE IF NOT EXISTS comprehend_jobs ( - batch_id TEXT, - input_s3_uri TEXT, - entities_job JSONB, - sentiment_job JSONB, - key_phrases_job JSONB - ) - """) - cursor.execute(""" - INSERT INTO comprehend_jobs (batch_id, input_s3_uri, entities_path, sentiment_path, key_phrases_path) - VALUES (%s, %s, %s, %s, %s)""", (str(uuid.uuid4()), input_s3_uri, entities_output.replace('s3://awstraindata/', ''), sentiment_output.replace('s3://awstraindata/', ''), key_phrases_output.replace('s3://awstraindata/', ''))) - conn.commit() - cursor.close() - conn.close() - diff --git a/backend/Backup/Transform.py b/backend/Backup/Transform.py deleted file mode 100644 index 2db246e..0000000 --- a/backend/Backup/Transform.py +++ /dev/null @@ -1,64 +0,0 @@ -from turtle import pd -import boto3 -import tarfile -import json -import psycopg2 -import io - -from EntityRecog.Utils import get_postgresql_connection -def lambda_handler(event, context): - for record in event['Records']: - print(f"New record: {record}") - bucket = record['s3']['bucket']['name'] - key = record['s3']['object']['key'] - conn = get_postgresql_connection() - s3 = boto3.client('s3') - obj = s3.get_object(Bucket=bucket, Key=key) - tar_bytes = io.BytesIO(obj['Body'].read()) - - # Extract .json inside the tar.gz - with tarfile.open(fileobj=tar_bytes, mode='r:gz') as tar: - for member in tar.getmembers(): - if member.name == "output" and member.isfile(): - file = tar.extractfile(member) - results = json.load(file) - print(f"Extracted JSON: {results}") - break - - if not results: - folderSplit = key.split('/') - type = folderSplit[0] - cursor = conn.cursor() - query = "SELECT * FROM comprehend_jobs WHERE entities_path = %s or sentiment_path = %s or key_phrases_path = %s" - cursor.execute(query, (key, key, key)) - row = cursor.fetchone() - if row: - # Download the file object - response = s3.get_object(Bucket=bucket, Key=row['input_s3_uri']) - - # Read CSV into DataFrame - input_csv = pd.read_csv(io.BytesIO(response['Body'].read())) - for row in results: - if type == 'entities': - location_mentions = ', '.join([entity['Text'] for entity in row['Entities'] if entity['Type'] == 'LOCATION']) - officials_involved = ', '.join([entity['Text'] for entity in row['Entities'] if entity['Type'] == 'PERSON']) - relevance_category = ', '.join([entity['Text'] for entity in row['Entities'] if entity['Type'] == 'TITLE']) - if not location_mentions: - cursor.execute("""update articles set location_mentions = %s where articles_id = %s""", (location_mentions, input_article['articles_id'])) - if not officials_involved: - cursor.execute("""update articles set officials_involved = %s where articles_id = %s""", (officials_involved, input_article['articles_id'])) - if not relevance_category: - cursor.execute("""update articles set relevance_category = %s where articles_id = %s""", (relevance_category, input_article['articles_id'])) - elif type == 'sentiment': - sentiment = row.get('Sentiment', 'NEUTRAL') - if not sentiment: - cursor.execute("""update articles set sentiment = %s where articles_id = %s""", (sentiment, input_article['articles_id'])) - elif type == 'keyphrases': - key_phrases = ', '.join(row.get('KeyPhrases', [])) - if not key_phrases: - cursor.execute("""update articles set key_phrases = %s where articles_id = %s""", (key_phrases, input_article['articles_id'])) - line_number = row['Line'] - input_article = input_csv[line_number] - - cursor.close() - conn.close() \ No newline at end of file diff --git a/backend/cluster-prioritization/Utils.py b/backend/cluster-prioritization/Utils.py deleted file mode 100644 index 41ed2b4..0000000 --- a/backend/cluster-prioritization/Utils.py +++ /dev/null @@ -1,25 +0,0 @@ -import json -import psycopg2 -def get_postgresql_connection(): - '''get the creds from local config''' - - """ - Establish a connection to a PostgreSQL database. - - Parameters: - host (str): The hostname of the PostgreSQL server. - database (str): The name of the database to connect to. - user (str): The username to connect with. - password (str): The password for the user. - - Returns: - psycopg2.extensions.connection: A connection object to the PostgreSQL database. - """ - try: - with open("pg_config.json") as f: - config = json.load(f) - conn = psycopg2.connect(**config) - return conn - except psycopg2.Error as e: - print("Error connecting to PostgreSQL database:", e) - return None \ No newline at end of file diff --git a/backend/cluster-prioritization/cluster-prioritization.py b/backend/cluster-prioritization/cluster-prioritization.py deleted file mode 100644 index f0a54b0..0000000 --- a/backend/cluster-prioritization/cluster-prioritization.py +++ /dev/null @@ -1,44 +0,0 @@ -import boto3 -import tarfile -import json -import io -from Utils import get_postgresql_connection -import datetime - - - -def lambda_handler(event, context): - try: - conn = get_postgresql_connection() - cursor = conn.cursor() - query = "SELECT * FROM clusters WHERE TO_DATE(startdate, 'DD/MM/YYYY') >= CURRENT_DATE ORDER BY clusters.referencecount DESC limit 20;" - cursor.execute(query) - rows = cursor.fetchall() - if rows: - for row in rows: - print(f"ID: {row[0]}, Priority: {row[1]}") - update_query = """ - WITH ranked AS ( - SELECT id, - 21 - ROW_NUMBER() OVER (ORDER BY clusters.referencecount DESC) AS priority - FROM clusters - WHERE TO_DATE(startdate, 'DD/MM/YYYY') >= CURRENT_DATE - LIMIT 20 - ) - UPDATE clusters t - SET priority = r.priority - FROM ranked r - WHERE t.id = r.id; - """ - cursor.execute(update_query) - conn.commit() - cursor.close() - ## delete the s3 object - # s3.delete_object(Bucket=bucket, Key=result['input_s3_uri']) - conn.close() - except Exception as e: - print(f"Error processing record: {e}") - return { - 'statusCode': 500, - 'body': json.dumps({'error': str(e)}) - } \ No newline at end of file diff --git a/backend/clustering_service/Utils.py b/backend/clustering_service/Utils.py deleted file mode 100644 index 41ed2b4..0000000 --- a/backend/clustering_service/Utils.py +++ /dev/null @@ -1,25 +0,0 @@ -import json -import psycopg2 -def get_postgresql_connection(): - '''get the creds from local config''' - - """ - Establish a connection to a PostgreSQL database. - - Parameters: - host (str): The hostname of the PostgreSQL server. - database (str): The name of the database to connect to. - user (str): The username to connect with. - password (str): The password for the user. - - Returns: - psycopg2.extensions.connection: A connection object to the PostgreSQL database. - """ - try: - with open("pg_config.json") as f: - config = json.load(f) - conn = psycopg2.connect(**config) - return conn - except psycopg2.Error as e: - print("Error connecting to PostgreSQL database:", e) - return None \ No newline at end of file diff --git a/backend/input_handler/Utils.py b/backend/input_handler/Utils.py deleted file mode 100644 index 41ed2b4..0000000 --- a/backend/input_handler/Utils.py +++ /dev/null @@ -1,25 +0,0 @@ -import json -import psycopg2 -def get_postgresql_connection(): - '''get the creds from local config''' - - """ - Establish a connection to a PostgreSQL database. - - Parameters: - host (str): The hostname of the PostgreSQL server. - database (str): The name of the database to connect to. - user (str): The username to connect with. - password (str): The password for the user. - - Returns: - psycopg2.extensions.connection: A connection object to the PostgreSQL database. - """ - try: - with open("pg_config.json") as f: - config = json.load(f) - conn = psycopg2.connect(**config) - return conn - except psycopg2.Error as e: - print("Error connecting to PostgreSQL database:", e) - return None \ No newline at end of file diff --git a/backend/output_handler/Utils.py b/backend/output_handler/Utils.py deleted file mode 100644 index 41ed2b4..0000000 --- a/backend/output_handler/Utils.py +++ /dev/null @@ -1,25 +0,0 @@ -import json -import psycopg2 -def get_postgresql_connection(): - '''get the creds from local config''' - - """ - Establish a connection to a PostgreSQL database. - - Parameters: - host (str): The hostname of the PostgreSQL server. - database (str): The name of the database to connect to. - user (str): The username to connect with. - password (str): The password for the user. - - Returns: - psycopg2.extensions.connection: A connection object to the PostgreSQL database. - """ - try: - with open("pg_config.json") as f: - config = json.load(f) - conn = psycopg2.connect(**config) - return conn - except psycopg2.Error as e: - print("Error connecting to PostgreSQL database:", e) - return None \ No newline at end of file