class ProcessData:
    """Load the topic dataset from a parquet file and label its topics.

    Attributes:
        data_path: Path of the parquet file read by :meth:`load_data`.
    """

    # Human-readable label for each integer topic id found in the "topic"
    # column. Kept at class level so it is built once and can be reused.
    TOPIC_LABELS: dict[int, str] = {
        0: "Society & Culture",
        1: "Science & Mathematics",
        2: "Health",
        3: "Education & Reference",
        4: "Computers & Internet",
        5: "Sports",
        6: "Business & Finance",
        7: "Entertainment & Music",
        8: "Family & Relationships",
        9: "Politics & Government",
    }

    def __init__(self, data_path: str):
        self.data_path = data_path

    def load_data(self) -> pd.DataFrame:
        """Read the parquet file and add a ``topic_name`` column.

        Returns:
            The loaded frame with an extra ``topic_name`` column holding the
            label for each row's ``topic`` id.

        Raises:
            KeyError: If the loaded data has no ``topic`` column.
        """
        data = pd.read_parquet(self.data_path)
        return self._add_topic_names(data)

    @staticmethod
    def _add_topic_names(data: pd.DataFrame) -> pd.DataFrame:
        """Add ``topic_name`` to *data* in place and return the same frame."""
        if "topic" not in data.columns:
            # Same exception type as the plain column lookup would raise,
            # but with enough context to diagnose a wrong input file.
            raise KeyError(
                f"expected a 'topic' column, found columns: {list(data.columns)}"
            )

        # Series.map leaves ids that are not in TOPIC_LABELS as NaN.
        data["topic_name"] = data["topic"].map(ProcessData.TOPIC_LABELS)
        return data
def test_load_data(tmp_path):
    """Check that ``load_data`` adds a ``topic_name`` column with mapped labels.

    The test writes its own small parquet file under pytest's ``tmp_path``
    instead of reading ``data/data_combined.parquet``, so it does not depend
    on the working directory or on a dataset being present on the machine
    running the tests.
    """
    parquet_path = tmp_path / "topics.parquet"
    pd.DataFrame({"topic": [0, 4, 9]}).to_parquet(parquet_path)

    process_data = ProcessData(str(parquet_path))
    data = process_data.load_data()

    assert "topic_name" in data.columns
    # Pin the actual labels, not just the presence of the column.
    assert data["topic_name"].tolist() == [
        "Society & Culture",
        "Computers & Internet",
        "Politics & Government",
    ]