CllsPy · CllsPy · Nov 22, 2025 · Nov 22, 2025
diff --git a/.github/workflows/nlp-pipeline.yml b/.github/workflows/nlp-pipeline.yml
@@ -0,0 +1,29 @@
+name: Python CI
+
+# Run the test suite on every push to main and on every pull request targeting main.
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      # The v2 majors of checkout/setup-python were built for the retired
+      # Node 12 action runtime; pin to maintained majors instead.
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set Up Python
+        uses: actions/setup-python@v5
+        with:
+          # Quoted so YAML does not parse the version as a float
+          # (an unquoted 3.10 would become 3.1).
+          python-version: "3.11"
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run Tests
+        run: pytest
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 venv/
-mlruns/
+mlruns/
+__pycache__/
diff --git a/requirements.txt b/requirements.txt
@@ -3,4 +3,5 @@ dvc
 pandas
 numpy
 scikit-learn
-nltk
+nltk
+pytest
diff --git a/src/train.py b/src/train.py
@@ -0,0 +1,33 @@
+import re
+import nltk
+import pandas as pd
+
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+
+
+class ProcessData:
+    def __init__(self, data_path: str):
+        self.data_path = data_path
+
+    def load_data(self):
+        data = pd.read_parquet(self.data_path)
+
+        topic_labels = {
+            0: "Society & Culture",
+            1: "Science & Mathematics",
+            2: "Health",
+            3: "Education & Reference",
+            4: "Computers & Internet",
+            5: "Sports",
+            6: "Business & Finance",
+            7: "Entertainment & Music",
+            8: "Family & Relationships",
+            9: "Politics & Government"
+        }
+
+        data["topic_name"] = data["topic"].map(topic_labels)
+
+        return data
+
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_train.py b/tests/test_train.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import pytest
+from src.train import ProcessData  
+
+
+def test_load_data():
+    process_data = ProcessData("data/data_combined.parquet")
+    data = process_data.load_data()
+
+    assert "topic_name" in data.columns
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,4 +3,5 @@ dvc @@
     pandas
     numpy
     scikit-learn
-    nltk
+    nltk
+    pytest