From 0542c3fe6488f24b8ea636fddd90a854e9780583 Mon Sep 17 00:00:00 2001
From: Cllspy <ifal.edu.2017@gmail.com>
Date: Sat, 22 Nov 2025 11:24:51 -0300
Subject: [PATCH 1/2] Add CI workflow, pytest setup and ProcessData loader

---
 .github/workflows/nlp-pipeline.yml | 29 ++++++++++++++++++++++++++
 .gitignore                         |  3 ++-
 requirements.txt                   |  3 ++-
 src/train.py                       | 33 ++++++++++++++++++++++++++++++
 tests/__init__.py                  |  0
 tests/test_train.py                | 10 +++++++++
 6 files changed, 76 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/nlp-pipeline.yml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_train.py

diff --git a/.github/workflows/nlp-pipeline.yml b/.github/workflows/nlp-pipeline.yml
new file mode 100644
index 0000000..78335fa
--- /dev/null
+++ b/.github/workflows/nlp-pipeline.yml
@@ -0,0 +1,29 @@
+name: Python CI  # installs requirements.txt and runs pytest
+
+on:  # triggers: pushes to main and pull requests targeting main
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: checkout code
+        uses: actions/checkout@v4  # v2 was built for the deprecated Node 12 runtime
+
+      - name: Set Up Python
+        uses: actions/setup-python@v5  # v2 was built for the deprecated Node 12 runtime
+        with:
+          python-version: "3.11"  # quoted so YAML keeps it a string
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Runs Tests
+        run: pytest
diff --git a/.gitignore b/.gitignore
index 906d2c6..dbce6a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 venv/
-mlruns/
\ No newline at end of file
+mlruns/
+__pycache__/
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index ad0d8fd..ce0c3e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ dvc
 pandas
 numpy
 scikit-learn
-nltk
\ No newline at end of file
+nltk
+pytest
\ No newline at end of file
diff --git a/src/train.py b/src/train.py
index e69de29..6b3db9f 100644
--- a/src/train.py
+++ b/src/train.py
@@ -0,0 +1,33 @@
+import re
+import nltk
+import pandas as pd
+
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+
+
+class ProcessData:
+    """Load a parquet dataset and attach readable topic names to it."""
+
+    def __init__(self, data_path: str):
+        self.data_path = data_path  # passed unchanged to pd.read_parquet
+
+    def load_data(self):
+        """Read ``self.data_path`` and return it with a ``topic_name`` column."""
+        topic_labels = {
+            0: "Society & Culture",
+            1: "Science & Mathematics",
+            2: "Health",
+            3: "Education & Reference",
+            4: "Computers & Internet",
+            5: "Sports",
+            6: "Business & Finance",
+            7: "Entertainment & Music",
+            8: "Family & Relationships",
+            9: "Politics & Government",
+        }
+        frame = pd.read_parquet(self.data_path)
+        frame["topic_name"] = frame["topic"].map(topic_labels)  # ids not in the table -> NaN
+        return frame
+        
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_train.py b/tests/test_train.py
new file mode 100644
index 0000000..44cc19a
--- /dev/null
+++ b/tests/test_train.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import pytest
+from src.train import ProcessData  
+
+
+def test_load_data():
+    process_data = ProcessData("data\data_combined.parquet")
+    data = process_data.load_data()
+
+    assert "topic_name" in data.columns

From 8d82cd72e4314c16f9a6e4fa44c32f79576048ae Mon Sep 17 00:00:00 2001
From: Cllspy <ifal.edu.2017@gmail.com>
Date: Sat, 22 Nov 2025 11:30:05 -0300
Subject: [PATCH 2/2] Use forward slash in test data path

---
 tests/test_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_train.py b/tests/test_train.py
index 44cc19a..63739a0 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -4,7 +4,7 @@
 
 
 def test_load_data():
-    process_data = ProcessData("data\data_combined.parquet")
+    process_data = ProcessData("data/data_combined.parquet")  # "/" is a separator on every OS; "\" is not on Linux
     data = process_data.load_data()
 
     assert "topic_name" in data.columns