Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,11 @@ mlruns/

#data
*.xlsx
*.csv
*.ipynb
*.txt
!tests/
!tests/data_NLP/
*.csv

#git hooks
.files_exceptions
Expand Down
22 changes: 11 additions & 11 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
FROM python:3.10

WORKDIR /project

ENV DEBIAN_FRONTEND=noninteractive

FROM python:3.9
RUN apt-get update && \
apt-get install -y --no-install-recommends build-essential git rsync software-properties-common ffmpeg libsm6 libxext6 && \
rm -rf /var/lib/apt/lists/*
apt-get install -y build-essential git rsync software-properties-common --allow-unauthenticated

WORKDIR /project
ENV VIRTUAL_ENV=/opt/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

ENV PYTHONPATH="/mlflow/projects/code/:$PYTHONPATH"

COPY . .
COPY --chown=root . .

RUN python -m pip install --upgrade pip && \
python -m pip install --no-cache-dir -r requirements.txt
# install requirements
RUN python -m pip install --upgrade pip && python -m pip install wheel
RUN python -m pip install --ignore-install ruamel-yaml -r requirements.txt
22 changes: 22 additions & 0 deletions config_NLP/config.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[server]
MLFLOW_S3_ENDPOINT_URL = http://10.36.191.201:8002
MLFLOW_TRACKING_URI = http://10.36.191.201:85
LOCAL_MLFLOW_S3_ENDPOINT_URL = http://0.0.0.0:8002
LOCAL_REMOTE_SERVER_URI = http://0.0.0.0:85
ARTIFACT_PATH = s3://mlflow

[project]
NAME = PROJECT_NLP

[data]
DATA_PATH = tests/data_NLP/synthetic_example.csv

[training]
MODEL_NAME = bert-base-uncased
LEARNING_RATE = 1e-5
BATCH_SIZE = 32
N_EPOCHS = 8
KFOLD = True
N_FOLDS = 5
RANDOM_STATE = 42
SAVE_MODEL = False
23 changes: 23 additions & 0 deletions config_NLP/local_config.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[server]
MLFLOW_S3_ENDPOINT_URL = http://localhost:8002
MLFLOW_TRACKING_URI = http://localhost:85
ARTIFACT_PATH = s3://mlflow

[project]
NAME = PROJECT_NLP

[system]
USE_GPU = 0

[data]
DATA_PATH = tests/data_NLP/synthetic_example.csv

[training]
MODEL_NAME = bert-base-uncased
LEARNING_RATE = 1e-5
BATCH_SIZE = 32
N_EPOCHS = 8
N_FOLDS = 5
RANDOM_STATE = 42
KFOLD = False
SAVE_MODEL = False
174 changes: 174 additions & 0 deletions project_NLP/NLPDataModule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import pandas as pd
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from project_NLP.NLPDataset import NLPDataset
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import mlflow


class NLPDataModule(pl.LightningDataModule):
    """LightningDataModule for a text-classification CSV dataset.

    Reads a CSV containing a ``TextString`` column and a ``ClassLabel``
    column, one-hot encodes the class labels, splits the rows into train and
    validation sets (either a caller-supplied cross-validation fold or a
    stratified 80/20 split) and serves them through ``NLPDataset``-backed
    DataLoaders.
    """

    def __init__(
        self,
        data_path,
        tokenizer,
        batch_size=8,
        max_token_len=256,
        num_workers=0,
        fold_indices=None,
        random_state=1,
    ):
        """Initialize the DataModule and eagerly read the CSV at ``data_path``.

        Args:
            data_path (str): Path to the CSV data file.
            tokenizer: Tokenizer forwarded to ``NLPDataset``.
            batch_size (int, optional): Size of the data batches. Default is 8.
            max_token_len (int, optional): Maximum length of tokens. Default is 256.
            num_workers (int, optional): Number of DataLoader workers. Default is 0.
            fold_indices (tuple, optional): ``(train_indices, val_indices)``
                positional (iloc) indices for one cross-validation fold. When
                ``None`` (or empty), a stratified 80/20 split seeded by
                ``random_state`` is used instead. Default is None.
            random_state (int, optional): Seed for the default train/val split.
                Default is 1.
        """

        super().__init__()
        self.batch_size = batch_size
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.num_workers = num_workers
        self.fold_indices = fold_indices
        self.random_state = random_state
        # Populated by read_csv_data() (called below) and by setup().
        self.df = None
        self.train_df = None
        self.val_df = None
        self.label_columns = None
        self.train_dataset = None
        self.val_dataset = None
        # Eager read so methods such as steps_per_epoch() can be used
        # before Lightning calls setup().
        self.read_csv_data()

    def setup(self, stage=None):
        """
        Set up the data module. Parse the data and create train and validation
        datasets, using ``fold_indices`` when provided and the default
        stratified split otherwise.
        """
        self.parse_df_data()
        if self.fold_indices:
            # Positional (iloc) indices for the current cross-validation fold.
            train_indices, val_indices = self.fold_indices
            self.train_df = self.df.iloc[train_indices]
            self.val_df = self.df.iloc[val_indices]
        else:
            self.default_train_val_split()

        self.train_dataset = NLPDataset(
            self.tokenizer,
            self.train_df,
            self.label_columns,
            self.max_token_len,
        )

        self.val_dataset = NLPDataset(
            self.tokenizer,
            self.val_df,
            self.label_columns,
            self.max_token_len,
        )

    def train_dataloader(self):
        """
        Create and return a data loader for the training data.

        Returns:
            DataLoader: Data loader for the training data (shuffled).
        """

        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=True,
        )

    def val_dataloader(self):
        """
        Create and return a data loader for the validation data.

        Returns:
            DataLoader: Data loader for the validation data (not shuffled).
        """

        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=False,
        )

    def read_csv_data(self):
        """Read the CSV at ``self.data_path`` into ``self.df``, dropping
        duplicate rows."""
        self.df = pd.read_csv(self.data_path)
        self.df = self.df.drop_duplicates()

    def parse_df_data(self):
        """
        Parse the data: one-hot encode each unique ``ClassLabel`` value into
        its own 0/1 column, add an integer ``label`` column, persist the label
        columns to ``label_columns.data`` (pickle) and log per-class counts to
        MLflow.

        Sets ``label_columns``, ``label_dict``, ``length_label_dict``,
        ``num_labels`` and ``num_classes`` on the instance.

        NOTE(review): ``mlflow.log_param`` is called here directly — confirm an
        active MLflow run context exists whenever this method runs.
        """

        self.label_columns = self.df.ClassLabel.unique()
        self.label_dict = {}
        for index, possible_label in enumerate(self.label_columns):
            self.label_dict[possible_label] = index
            # New one-hot column: 1 where the row belongs to this class.
            self.df[possible_label] = 0
            self.df.loc[self.df["ClassLabel"] == possible_label, [possible_label]] = 1

        # Integer-encoded target derived from the label -> index mapping.
        self.df["label"] = self.df.ClassLabel.replace(self.label_dict)

        self.length_label_dict = len(self.label_dict)
        self.num_labels = self.length_label_dict
        self.num_classes = len(list(set(self.df.label)))

        # Persist the label-column order so downstream code can decode outputs.
        with open("label_columns.data", "wb") as filehandle:
            pickle.dump(self.label_columns, filehandle)

        class_counts = self.df["ClassLabel"].value_counts()
        for class_label, count in class_counts.items():
            mlflow.log_param(f"class_{class_label}_count", count)

    def default_train_val_split(self):
        """Split ``self.df`` into an 80/20 train/validation split, stratified
        on the ``label`` column and seeded with ``self.random_state``."""
        train_df, val_df = train_test_split(
            self.df,
            test_size=0.2,
            random_state=self.random_state,
            stratify=self.df["label"],
        )
        self.train_df = train_df
        self.val_df = val_df

    def steps_per_epoch(self):
        """
        Calculate and return the number of steps per epoch based on the batch size.

        Returns:
            int: Number of steps per epoch (floor division of the training-set
            size by the batch size).
        """
        # Lazily build the default split if setup() has not run yet.
        if self.train_df is None or len(self.train_df) == 0:
            self.parse_df_data()
            self.default_train_val_split()
        return len(self.train_df) // self.batch_size

    def dataset_stats(self, dataset) -> dict:
        """
        Calculate and return a dictionary of dataset statistics including label
        distribution and number of samples.

        Args:
            dataset (pandas.DataFrame): The dataset to analyze; must contain a
                ``ClassLabel`` column.

        Returns:
            dict: ``{"n_samples": int, "label_counts": {class_label: count}}``.
        """
        stats = {}
        stats["n_samples"] = len(dataset)
        label_counts = dataset["ClassLabel"].value_counts()
        label_counts_dict = label_counts.to_dict()
        stats["label_counts"] = label_counts_dict
        return stats
75 changes: 75 additions & 0 deletions project_NLP/NLPDataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import pandas as pd
import torch
from torch.utils.data import Dataset
from typing import Optional
from transformers import BertTokenizerFast as BertTokenizer

import configparser


class NLPDataset(Dataset):
    """Torch ``Dataset`` over a DataFrame holding a ``TextString`` text column
    and one-hot encoded label columns; rows are tokenized on access."""

    def __init__(
        self,
        tokenizer=None,
        data: Optional[pd.DataFrame] = None,
        label_columns: Optional[list] = None,
        max_token_len: int = 256,
    ):
        """Store the tokenizer, data and encoding settings.

        Args:
            tokenizer: Tokenizer used by ``encoder``. When ``None``, a
                ``bert-base-uncased`` fast tokenizer is loaded as a fallback.
            data (pd.DataFrame, optional): Rows with ``TextString`` plus the
                one-hot label columns.
            label_columns (list, optional): Names of the one-hot label columns;
                their order defines the class order of the ``labels`` tensor.
            max_token_len (int, optional): Maximum token length used for
                padding/truncation. Default is 256.
        """
        # Bug fix: the original assigned the fallback tokenizer only to the
        # local variable and set self.tokenizer only in the else-branch, so
        # NLPDataset(tokenizer=None) left self.tokenizer unset and encoder()
        # raised AttributeError. Also use `is None` instead of `== None`.
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len
        self.label_columns = label_columns

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """
        Given an index, return a dictionary containing 'diag_final',
        'input_ids', 'attention_mask' and 'labels' after encoding a row of the
        data.

        Args:
            index (int): Index of the data row.

        Returns:
            dict: A dictionary containing 'diag_final' (list with the raw text),
            'input_ids' and 'attention_mask' (1-D LongTensors) and 'labels'
            (FloatTensor of the one-hot label columns) for the row at `index`.
        """

        data_row = self.data.iloc[index]
        diag_final = [data_row["TextString"]]
        labels = data_row[self.label_columns]  # nth index is nth class
        encoding = self.encoder(diag_final)
        return dict(
            diag_final=diag_final,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            # list() avoids handing a pandas Series (object-dtype for mixed
            # rows, with deprecated positional indexing) straight to the
            # legacy FloatTensor constructor.
            labels=torch.FloatTensor(list(labels)),
        )

    def encoder(self, diag_final):
        """
        Encode a given list of text strings using the tokenizer set during
        initialization.

        Args:
            diag_final (list): List of text strings to be encoded.

        Returns:
            dict: A dictionary containing the following keys:
                - 'input_ids': Tensor of token ids obtained from the text strings,
                  padded/truncated to ``max_token_len``.
                - 'attention_mask': Tensor where positions with original tokens are
                  represented by 1 and positions with padding are represented by 0.
        """

        return self.tokenizer.batch_encode_plus(
            diag_final,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
Loading