From 0542c3fe6488f24b8ea636fddd90a854e9780583 Mon Sep 17 00:00:00 2001
From: Cllspy
Date: Sat, 22 Nov 2025 11:24:51 -0300
Subject: [PATCH 1/2] update

---
Review notes (free text between the "---" line and the diffstat; not part of the diff):
 * Adds a GitHub Actions workflow named "Python CI" that runs on pushes and
   pull requests targeting main: it sets up Python 3.11, installs
   requirements.txt and runs pytest.
 * Adds __pycache__/ to .gitignore and pytest to requirements.txt.
 * Adds class ProcessData to src/train.py. load_data() reads the parquet
   file at data_path and adds a "topic_name" column by mapping the "topic"
   column through a fixed table of ten labels keyed 0-9.
 * Adds tests/test_train.py, which loads "data\data_combined.parquet" and
   asserts that the "topic_name" column is present. The path is written
   with a backslash here; patch 2/2 replaces it with a forward slash.
 * NOTE(review): the parquet file is not added by this series; presumably
   it is provided by other means - confirm it is available to the CI job.
 * NOTE(review): pd.read_parquet needs a parquet engine; the first two
   lines of requirements.txt are outside this hunk - confirm one is listed.
 * NOTE(review): re, nltk, stopwords, word_tokenize and WordNetLemmatizer
   are imported in src/train.py but not used by the code added here.

 .github/workflows/nlp-pipeline.yml | 29 ++++++++++++++++++++++++++
 .gitignore                         |  3 ++-
 requirements.txt                   |  3 ++-
 src/train.py                       | 33 ++++++++++++++++++++++++++++++
 tests/__init__.py                  |  0
 tests/test_train.py                | 10 +++++++++
 6 files changed, 76 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/nlp-pipeline.yml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_train.py

diff --git a/.github/workflows/nlp-pipeline.yml b/.github/workflows/nlp-pipeline.yml
new file mode 100644
index 0000000..78335fa
--- /dev/null
+++ b/.github/workflows/nlp-pipeline.yml
@@ -0,0 +1,29 @@
+name: Python CI
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: checkout code
+        uses: actions/checkout@v2
+
+      - name: Set Up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.11"
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Runs Tests
+        run: pytest
diff --git a/.gitignore b/.gitignore
index 906d2c6..dbce6a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 venv/
-mlruns/
\ No newline at end of file
+mlruns/
+__pycache__/
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index ad0d8fd..ce0c3e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ dvc
 pandas
 numpy
 scikit-learn
-nltk
\ No newline at end of file
+nltk
+pytest
\ No newline at end of file
diff --git a/src/train.py b/src/train.py
index e69de29..6b3db9f 100644
--- a/src/train.py
+++ b/src/train.py
@@ -0,0 +1,33 @@
+import re
+import nltk
+import pandas as pd
+
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+
+
+class ProcessData:
+    def __init__(self, data_path: str):
+        self.data_path = data_path
+
+    def load_data(self):
+        data = pd.read_parquet(self.data_path)
+
+        topic_labels = {
+            0: "Society & Culture",
+            1: "Science & Mathematics",
+            2: "Health",
+            3: "Education & Reference",
+            4: "Computers & Internet",
+            5: "Sports",
+            6: "Business & Finance",
+            7: "Entertainment & Music",
+            8: "Family & Relationships",
+            9: "Politics & Government"
+        }
+
+        data["topic_name"] = data["topic"].map(topic_labels)
+
+        return data
+
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_train.py b/tests/test_train.py
new file mode 100644
index 0000000..44cc19a
--- /dev/null
+++ b/tests/test_train.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import pytest
+from src.train import ProcessData
+
+
+def test_load_data():
+    process_data = ProcessData("data\data_combined.parquet")
+    data = process_data.load_data()
+
+    assert "topic_name" in data.columns
From 8d82cd72e4314c16f9a6e4fa44c32f79576048ae Mon Sep 17 00:00:00 2001
From: Cllspy
Date: Sat, 22 Nov 2025 11:30:05 -0300
Subject: [PATCH 2/2] commit

---
Review notes (free text between the "---" line and the diffstat; not part of the diff):
 * Changes the data path in tests/test_train.py from
   "data\data_combined.parquet" to "data/data_combined.parquet".
   No other line is touched.

 tests/test_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_train.py b/tests/test_train.py
index 44cc19a..63739a0 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -4,7 +4,7 @@
 
 
 def test_load_data():
-    process_data = ProcessData("data\data_combined.parquet")
+    process_data = ProcessData("data/data_combined.parquet")
     data = process_data.load_data()
 
     assert "topic_name" in data.columns