diff --git a/.github/workflows/nlp-pipeline.yml b/.github/workflows/nlp-pipeline.yml
new file mode 100644
index 0000000..78335fa
--- /dev/null
+++ b/.github/workflows/nlp-pipeline.yml
@@ -0,0 +1,29 @@
+name: Python CI
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4  # v2 targets the retired Node 12 runner runtime
+
+      - name: Set Up Python
+        uses: actions/setup-python@v5  # v2 targets the retired Node 12 runner runtime
+        with:
+          python-version: "3.11"
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run Tests
+        run: python -m pytest  # -m puts the repo root on sys.path so `src` imports resolve
diff --git a/.gitignore b/.gitignore
index 906d2c6..dbce6a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 venv/
-mlruns/
\ No newline at end of file
+mlruns/
+__pycache__/
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index ad0d8fd..ce0c3e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ dvc
 pandas
 numpy
 scikit-learn
-nltk
\ No newline at end of file
+nltk
+pytest
\ No newline at end of file
diff --git a/src/train.py b/src/train.py
index e69de29..6b3db9f 100644
--- a/src/train.py
+++ b/src/train.py
@@ -0,0 +1,33 @@
+import re
+import nltk
+import pandas as pd
+
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+
+
+class ProcessData:
+    """Read a parquet dataset and label each row with its topic name."""
+
+    def __init__(self, data_path: str):
+        self.data_path = data_path  # forwarded to pd.read_parquet by load_data
+
+    def load_data(self):
+        """Return the parquet contents plus a ``topic_name`` column.
+
+        ``topic`` codes without an entry below come out as NaN.
+        """
+        # A name's position in this tuple is its integer ``topic`` code.
+        names = (
+            "Society & Culture", "Science & Mathematics", "Health",
+            "Education & Reference", "Computers & Internet", "Sports",
+            "Business & Finance", "Entertainment & Music",
+            "Family & Relationships", "Politics & Government",
+        )
+        code_to_name = dict(enumerate(names))
+
+        frame = pd.read_parquet(self.data_path)
+        frame["topic_name"] = frame["topic"].map(code_to_name)
+        return frame
+        
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_train.py b/tests/test_train.py
new file mode 100644
index 0000000..63739a0
--- /dev/null
+++ b/tests/test_train.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import pytest
+from src.train import ProcessData  
+
+
+def test_load_data(tmp_path):
+    """Round-trip a tiny parquet file so the test needs no checked-in dataset."""
+    pd.DataFrame({"topic": [0, 9]}).to_parquet(tmp_path / "data.parquet")
+    data = ProcessData(str(tmp_path / "data.parquet")).load_data()
+    assert list(data["topic_name"]) == ["Society & Culture", "Politics & Government"]