Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/workflows/nlp-pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Continuous-integration workflow: installs the project requirements and runs
# the pytest suite for every push and pull request that targets main.
name: Python CI

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      # v2 of the official actions runs on a retired Node runtime; use the
      # currently supported major versions instead.
      - name: checkout code
        uses: actions/checkout@v4

      - name: Set Up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not parse the version as a float.
          python-version: "3.11"

      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Runs Tests
        run: pytest
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
venv/
mlruns/
mlruns/
__pycache__/
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ dvc
pandas
numpy
scikit-learn
nltk
nltk
pytest
33 changes: 33 additions & 0 deletions src/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import re
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


class ProcessData:
    """Load a parquet dataset and attach human-readable topic names."""

    def __init__(self, data_path: str):
        """Store the dataset location; nothing is read until ``load_data``."""
        self.data_path = data_path

    def load_data(self):
        """Read the parquet file at ``data_path`` and add a ``topic_name`` column.

        The ``topic`` column is translated through a fixed table of ten
        labels keyed 0-9; the new column holds the result of that lookup.

        Returns:
            The loaded DataFrame with the extra ``topic_name`` column.
        """
        # Position in the tuple is the numeric topic id (0 through 9).
        label_by_id = dict(
            enumerate(
                (
                    "Society & Culture",
                    "Science & Mathematics",
                    "Health",
                    "Education & Reference",
                    "Computers & Internet",
                    "Sports",
                    "Business & Finance",
                    "Entertainment & Music",
                    "Family & Relationships",
                    "Politics & Government",
                )
            )
        )

        frame = pd.read_parquet(self.data_path)
        frame["topic_name"] = frame["topic"].map(label_by_id)
        return frame

Empty file added tests/__init__.py
Empty file.
10 changes: 10 additions & 0 deletions tests/test_train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pandas as pd
import pytest
from src.train import ProcessData


def test_load_data():
    """Loading the combined dataset must produce a ``topic_name`` column."""
    loader = ProcessData("data/data_combined.parquet")
    loaded_columns = loader.load_data().columns

    assert "topic_name" in loaded_columns
Loading