diff --git a/Structured_data_template/.DS_Store b/Structured_data_template/.DS_Store new file mode 100644 index 0000000..0a9c1e8 Binary files /dev/null and b/Structured_data_template/.DS_Store differ diff --git a/Structured_data_template/.gitattributes b/Structured_data_template/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/Structured_data_template/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/Structured_data_template/.github/ISSUE_TEMPLATE/CAPA.md b/Structured_data_template/.github/ISSUE_TEMPLATE/CAPA.md new file mode 100644 index 0000000..3de822d --- /dev/null +++ b/Structured_data_template/.github/ISSUE_TEMPLATE/CAPA.md @@ -0,0 +1,36 @@ +--- +name: CAPA +about: File a corrective action or preventative action (CAPA) report +title: "[CAPA]: " +labels: '' +assignees: '' + +--- + +*This template is used to file a corrective action or preventative action (CAPA) report as specified by [CSC-QMS: SOP-016](https://github.com/GSTT-CSC/CSC-QMS)* + +*Please complete all of the following fields* + +**Type** +*Is this a corrective action or a preventative action? (delete as appropriate)* + +Corrective/Preventative action + +**Datix ID** +*If this CAPA has an associated Datix report, please enter the Datix ID here.* + +**Description** +*Please describe the action. e.g., No contact details provided as part of product information* + +**Immediate Actions** +*Describe any immeadiate actions needed to ensure safe control of the product/QMS process etc.* + +**Root Cause** +*Please describe the root cause. 
e.g., Loss of information during development* + +**Potentially Adverse Implications** +*Describe any potentially adverse implications to the action e.g., Release of product version and process update* + +**Time Frame for Completion** +*Confirm the time frame within which the actions must be completed, confirmed by the QMO.* +*Consult CSC PR.014 for Incident Reporting timeframes* diff --git a/Structured_data_template/.github/ISSUE_TEMPLATE/meeting-minutes.md b/Structured_data_template/.github/ISSUE_TEMPLATE/meeting-minutes.md new file mode 100644 index 0000000..da62647 --- /dev/null +++ b/Structured_data_template/.github/ISSUE_TEMPLATE/meeting-minutes.md @@ -0,0 +1,17 @@ +--- +name: Meeting Minutes +about: Log agenda and minutes +title: "[Minutes] - Weekly Meeting - DD-MM-YYYY" +labels: minutes +assignees: '' + +--- + +**Attendance/Apologies** + +**Agenda** + +**Actions** + +- [ ] Action 1 +- [ ] Action 2 diff --git a/Structured_data_template/.github/ISSUE_TEMPLATE/systematic_review.md b/Structured_data_template/.github/ISSUE_TEMPLATE/systematic_review.md new file mode 100644 index 0000000..5d05da6 --- /dev/null +++ b/Structured_data_template/.github/ISSUE_TEMPLATE/systematic_review.md @@ -0,0 +1,41 @@ +--- +name: Systematic Review +about: Log outcomes of systematic review of documentation at key project milestones +title: "[Systematic Review] - DD-MM-YYYY" +labels: minutes +assignees: '' + +--- + +### Stakeholders + +| Name | Role | +|------|------| +| | | + +### Project Milestone +> At which milestone is this review taking place? + +Requirements / Proof of Concept / Deployment / Before Prospective Study / Clinical release / Other + + +### Requirements Review + +> Are requirements still adequate and non-conflicting? + +### Hazard log review - CRM + +> Have any new hazards been identified? + +### Design specification review + +> Have extra design spec items been identified? + +### Verification and validation review + +> Have any unit tests or manual tests failed? 
+ +### Outstanding issues + + +### Resource issues diff --git a/Structured_data_template/.github/PULL_REQUEST_TEMPLATE.md b/Structured_data_template/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..d6a080d --- /dev/null +++ b/Structured_data_template/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,24 @@ +### Linked Issue(s) +> Explicitly tag the issue linked to this pull request, if any. + +### Summary of changes +*Briefly describe the changes in this PR.* + +### Reason for changes +*Explain why these changes are being made* + + + +### Clinical Risk Management Review +#### Summary of the clinical risk management review: +Briefly describe the outcome of the review and any actions taken. + +#### Hazard Impact +**Related Hazards:** *Include references to the Hazard Log e.g. HZ-xxx, HZ-yyy* + +**Impact on Hazards:** +- Mitigates: *List hazards it helps mitigate (e.g., HZ-xxx, HZ-yyy)*. +- Induces: *List hazards it might induce (e.g., HZ-xxx, HZ-yyy)*. + +## Quality Assurance +- [ ] Unit tests added diff --git a/Structured_data_template/.github/hooks/pre-commit b/Structured_data_template/.github/hooks/pre-commit new file mode 100644 index 0000000..68e349b --- /dev/null +++ b/Structured_data_template/.github/hooks/pre-commit @@ -0,0 +1,75 @@ +#!/bin/sh + +# REGEX PATTERNS +FORBIDDEN_ADDRESS="^[A-PR-UWYZ]([0-9]{1,2}|([A-HK-Y][0-9]([0-9ABEHMNPRV-Y])?)|[0-9][A-HJKPS-UW])?[[:space:]]?[0-9][ABD-HJLNP-UW-Z]{2}$" +FORBIDDEN_PATIENT_ID="^[a-zA-Z][0-9]{6}$|^[0-9]{7}[a-zA-Z]$" +FORBIDDEN_PHONE="(\+44|07)[0-9]{9}" +FORBIDDEN_ACCESSION_NUMBER="sp-[0-9]{2}-[0-9]{7}" +FORBIDDEN_DOB="[0-1][0-9]/[0-3][0-9]/[1-2][0-9]{3}" +FORBIDDEN_DATE_TIME="[0-1][0-9]/[0-3][0-9]/[1-2][0-9]{3}\s[0-2][0-9]:[0-5][0-9]:[0-5][0-9]" +FORBIDDEN_NHS="[0-9]{10}" + +git_verification_patterns=( $FORBIDDEN_ADDRESS $FORBIDDEN_PATIENT_ID $FORBIDDEN_PHONE $FORBIDDEN_ACCESSION_NUMBER $FORBIDDEN_DATE_TIME $FORBIDDEN_DOB $FORBIDDEN_NHS) +git_verification_patterns_desc=("Address" "Patient ID" "Phone 
number" "Accession Number" "Date and Time" "Date of Birth" "NHS Number") + +# Get modified files +FILES_MODIFIED=$(git diff --cached --name-only) + +NUM_FILES_CHECKED=0 +NUM_FILES_OFFENCES=0 + +# Exceptions +exception_file=".sensitive_exceptions" +exclusion_file=".files_exceptions" + +echo "-- RUNNING SENSITIVE DATA CHECKS ----------------------------------------" + +for F in $FILES_MODIFIED +do + F_basename=$(basename $F) + if grep -Fiq -- "$F_basename" $exclusion_file; then + continue + fi + + for i in "${!git_verification_patterns[@]}"; do + MATCHES=$(egrep -i --line-number "${git_verification_patterns[$i]}" "$F" || true) + + for MATCH in $MATCHES; do + IFS=':' read -ra PARTS <<< "$MATCH" + LINE_NUMBER=${PARTS[0]} + CONTENT=${PARTS[1]} + + # Skip exceptions + if echo "$CONTENT" | grep -Fiq -f $exception_file; then + continue + fi + + echo "FILE: $F" + echo " DESC: ${git_verification_patterns_desc[$i]}" + echo " MATCH: $MATCH" + echo " " + + NUM_FILES_OFFENCES=$((NUM_FILES_OFFENCES+1)) + done + done + + NUM_FILES_CHECKED=$((NUM_FILES_CHECKED+1)) +done + +echo "-- SUMMARY --------------------------------------------------------------" +echo "" +echo " Files Checked: $NUM_FILES_CHECKED" +echo " Num File Offences: $NUM_FILES_OFFENCES" +if [ $NUM_FILES_OFFENCES -gt 0 ]; then + echo " Status: FAIL" + echo " " +else + echo " Status: OK" + echo " " +fi + +if [ $NUM_FILES_OFFENCES -gt 0 ]; then + exit 1 +else + exit 0 +fi diff --git a/Structured_data_template/.github/workflows/development_test.yml b/Structured_data_template/.github/workflows/development_test.yml new file mode 100644 index 0000000..e6e79a0 --- /dev/null +++ b/Structured_data_template/.github/workflows/development_test.yml @@ -0,0 +1,46 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Development tests + +on: + 
pull_request: + +env: + PROJECT_NAME: template # the project name + APPLICATION_DIR: project # the name of the main directory that contains the project code + TEST_DIR: tests # the name of the directory that contains the tests + +jobs: + + build-and-test: + + runs-on: ubuntu-latest + + steps: + - name: checkout + uses: actions/checkout@v2 + + - name: Build and tag image + run: docker build -t $PROJECT_NAME:latest . + + - name: Setup flake8 annotations + uses: rbialon/flake8-annotations@v1 + + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + docker run --mount type=bind,source=$(pwd),target=/$APPLICATION_DIR $PROJECT_NAME flake8 /$APPLICATION_DIR --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + docker run --mount type=bind,source=$(pwd),target=/$APPLICATION_DIR $PROJECT_NAME flake8 /$APPLICATION_DIR --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Test with pytest + run: | + coverage run -m --source=project pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=$APPLICATION_DIR $TEST_DIR/ | tee pytest-coverage.txt + + - name: Pytest coverage comment + id: coverageComment + uses: MishaKav/pytest-coverage-comment@main + with: + pytest-coverage-path: ./pytest-coverage.txt + junitxml-path: ./pytest.xml diff --git a/Structured_data_template/.github/workflows/production_test.yml b/Structured_data_template/.github/workflows/production_test.yml new file mode 100644 index 0000000..5b22e76 --- /dev/null +++ b/Structured_data_template/.github/workflows/production_test.yml @@ -0,0 +1,63 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Production tests + +on: + push: + branches: + - 'main' 
+ - 'master' + - 'release/*' + +env: + PROJECT_NAME: template # the project name + APPLICATION_DIR: project # the name of the main directory that contains the project code + TEST_DIR: tests # the name of the directory that contains the tests + COV_GIST_BADGEID: COV_GIST_BADGEID # this github repo secret should define a gist token see https://github.com/Schneegans/dynamic-badges-action + COV_GIST_NAME: COV_GIST_NAME # this github repo secret should contain the filename of the gist see https://github.com/Schneegans/dynamic-badges-action + +jobs: + + build-and-test: + + runs-on: ubuntu-latest + + steps: + - name: checkout + uses: actions/checkout@v2 + + - name: Build and tag image + run: docker build -t $PROJECT_NAME:latest . + + - name: Setup flake8 annotations + uses: rbialon/flake8-annotations@v1 + + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + docker run --mount type=bind,source=$(pwd),target=/$APPLICATION_DIR $PROJECT_NAME flake8 /$APPLICATION_DIR --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. 
The GitHub editor is 127 chars wide + docker run --mount type=bind,source=$(pwd),target=/$APPLICATION_DIR $PROJECT_NAME flake8 /$APPLICATION_DIR --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Test with pytest + run: | + coverage run -m --source=project pytest --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=$APPLICATION_DIR $TEST_DIR/ | tee pytest-coverage.txt + + - name: Pytest coverage comment + id: coverageComment + uses: MishaKav/pytest-coverage-comment@main + with: + pytest-coverage-path: ./pytest-coverage.txt + junitxml-path: ./pytest.xml + + - name: Create coverage Badge + uses: schneegans/dynamic-badges-action@v1.0.0 + with: + auth: ${{ secrets.PYTEST_COVERAGE_COMMENT }} + gistID: COV_GIST_BADGEID + filename: $COV_GIST_NAME + label: Test coverage + message: ${{ steps.coverageComment.outputs.coverage }} + color: ${{ steps.coverageComment.outputs.color }} + namedLogo: python diff --git a/Structured_data_template/.gitignore b/Structured_data_template/.gitignore new file mode 100644 index 0000000..ed517d4 --- /dev/null +++ b/Structured_data_template/.gitignore @@ -0,0 +1,139 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +structdata2/ +structdata-venv/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Added over default +.idea/ + +pytest.xml +pytest-coverage.txt +mlruns/ + +#data +*.xlsx +*.csv +*.ipynb +*.txt +*.png + +#git hooks +.files_exceptions +.sensitive_exceptions + +#models +*.pkl +*.json +models/ diff --git a/Structured_data_template/README.md b/Structured_data_template/README.md new file mode 100644 index 0000000..9abc899 --- /dev/null +++ b/Structured_data_template/README.md @@ -0,0 +1,80 @@ +# ML Template for Structured Data + +A clean, production-ready template for machine learning on tabular data with automated preprocessing, hyperparameter tuning, and comprehensive evaluation. + + + +## Quick Start + +### 1. Setup +```bash +pip install -r requirements.txt +``` + +### 2. Configure +Edit `train/config/local_config.cfg`: +```ini +[data] +data_path = data/your_dataset.csv +target_column = your_target + +[model] +model_name = xgboost +task = classification +``` + +### 3. 
Run +```bash +cd train + +# Optional: Hyperparameter tuning +python scripts/tune.py --config config/local_config.cfg + +# Train model +python scripts/train.py --config config/local_config.cfg +``` + +## Project Structure + +``` +project-template/ +├── data/ # Your datasets +├── train/ +│ ├── config/ # Configuration files +│ ├── scripts/ # Training & tuning scripts +│ └── src/ # Core modules +│ ├── DataLoader.py # Data I/O operations +│ ├── DataModule.py # Pipeline orchestration +│ └── utils/ # Utilities +├── models/ # Saved models +└── plots/ # Generated visualizations +``` + +## Supported Models +Random Forest, XGBoost, Logistic Regression, Linear Regression + +## Key Configuration Options + +```ini +# Data preprocessing +[preprocessing.numerical] +age.imputer = median +age.scaler = standard + +[preprocessing.categorical] +category.encoder = onehot +category.encoder_options = {"sparse_output": false} + +# Training +[training] +use_optuna = true # Enable auto-tuning +n_trials = 50 # Optimization trials +use_kfold_cv = true # Cross-validation +n_splits = 5 # CV folds +``` + +## Output + +- **Models**: Saved in `models/` with preprocessors & metadata +- **Metrics**: Logged to MLflow (view at `http://localhost:5000`) +- **Plots**: EDA & evaluation charts in `plots/` diff --git a/Structured_data_template/app/__init__.py_ b/Structured_data_template/app/__init__.py_ new file mode 100644 index 0000000..e69de29 diff --git a/Structured_data_template/plots/correlation_matrix.png b/Structured_data_template/plots/correlation_matrix.png new file mode 100644 index 0000000..b53d7f4 Binary files /dev/null and b/Structured_data_template/plots/correlation_matrix.png differ diff --git a/Structured_data_template/plots/feature_distributions_categorical.png b/Structured_data_template/plots/feature_distributions_categorical.png new file mode 100644 index 0000000..450bd41 Binary files /dev/null and b/Structured_data_template/plots/feature_distributions_categorical.png differ diff --git 
a/Structured_data_template/plots/feature_distributions_numerical.png b/Structured_data_template/plots/feature_distributions_numerical.png new file mode 100644 index 0000000..8f406ff Binary files /dev/null and b/Structured_data_template/plots/feature_distributions_numerical.png differ diff --git a/Structured_data_template/plots/feature_distributions_target.png b/Structured_data_template/plots/feature_distributions_target.png new file mode 100644 index 0000000..5fd2bca Binary files /dev/null and b/Structured_data_template/plots/feature_distributions_target.png differ diff --git a/Structured_data_template/plots/outlier_analysis.png b/Structured_data_template/plots/outlier_analysis.png new file mode 100644 index 0000000..b0ea6c1 Binary files /dev/null and b/Structured_data_template/plots/outlier_analysis.png differ diff --git a/Structured_data_template/tests/__init__.py b/Structured_data_template/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Structured_data_template/tests/test_DataModule.py b/Structured_data_template/tests/test_DataModule.py new file mode 100644 index 0000000..28c04eb --- /dev/null +++ b/Structured_data_template/tests/test_DataModule.py @@ -0,0 +1,323 @@ +import pytest +import pandas as pd +from pathlib import Path +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.pipeline import Pipeline +import numpy as np + +from project.DataModule import DataModule + +@pytest.fixture(scope="module") +def dummy_data_path(tmp_path_factory): + """Creates a dummy CSV file for testing and returns its path.""" + data = { + 'Gender': ['Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male'] * 3, + 'Age': list(range(20, 50)), + 'Height': [1.70, 1.85, 1.65, 1.75, 1.72, 1.80, 1.68, 1.90, 1.60, 1.78] * 3, + 'Weight': [70.5, 90.2, 60.0, 80.1, 75.3, 85.0, 68.9, 95.5, 55.1, 82.7] * 3, + 'Favorite Food': ['Pizza', 'Burger', 'Salad', 'Pasta', 
'Sushi', 'Taco', 'Steak', 'Curry', 'Ramen', 'Pho'] * 3, + 'TargetColumn': ['ClassA', 'ClassB', 'ClassC'] * 10 + } + df = pd.DataFrame(data) + + data_dir = tmp_path_factory.mktemp("data") + fn = data_dir / "dummy_data.csv" + + df.to_csv(fn, index=False) + return fn + +@pytest.fixture(scope="module") +def basic_data_module(dummy_data_path): + """Provides a basic DataModule instance for testing.""" + return DataModule(data_path=dummy_data_path, target_column='TargetColumn') + +@pytest.fixture(scope="module") +def configured_data_module(dummy_data_path): + """Provides a DataModule instance with specific configurations.""" + preprocessor_settings = { + 'numerical': { + 'Age': {'scaler': 'standard'}, + 'Height': {'scaler': 'standard'}, + 'Weight': {'scaler': 'standard'} + }, + 'categorical': { + 'Gender': {'encoder': 'onehot', 'encoder_options': {'sparse_output': False}} + } + } + return DataModule( + data_path=dummy_data_path, + target_column='TargetColumn', + columns_to_drop=['Favorite Food'], + preprocessor_settings=preprocessor_settings, + visualise=False, + check_imbalance=True, + stratify=True, + test_size=0.2 + ) + +def test_initialization(basic_data_module): + """Test DataModule initialization.""" + dm = basic_data_module + assert dm.data_path is not None + assert dm.target_column == 'TargetColumn' + assert dm.columns_to_drop == [] + assert dm.preprocessor_settings == {} + assert dm.data is None + assert dm.preprocessor is None + assert dm.visualise is False + assert dm.check_imbalance is False + assert dm.test_size == 0.2 + assert dm.stratify is False + assert dm.random_state == 42 + +def test_load_data(dummy_data_path): + """Test data loading functionality.""" + dm = DataModule(data_path=dummy_data_path, target_column='TargetColumn') + X, y = dm.load_and_prepare() + assert dm.data is not None + assert not dm.data.empty + assert 'TargetColumn' not in X.columns + assert 'TargetColumn' in y.name + assert X.shape[0] == y.shape[0] + +def 
test_load_and_prepare(dummy_data_path, capsys): + """Test load_and_prepare method, including column dropping and inference.""" + dm = DataModule(data_path=dummy_data_path, target_column='TargetColumn', columns_to_drop=['Weight']) + X, y = dm.load_and_prepare() + assert 'Weight' not in X.columns + assert 'Weight' not in dm.data.columns + assert dm.numerical_features or dm.categorical_features + assert 'Age' in dm.numerical_features + assert 'Gender' in dm.categorical_features + + captured = capsys.readouterr() + assert f"Data loaded successfully from {dummy_data_path}. Shape: (30, 6)" in captured.out + assert "Dropped columns: Weight" in captured.out + + +def test_infer_column_types(dummy_data_path): + """Test column type inference.""" + dm = DataModule(data_path=dummy_data_path, target_column='TargetColumn') + X, _ = dm.load_and_prepare() + assert 'Age' in dm.numerical_features + assert 'Gender' in dm.categorical_features + assert 'Height' in dm.numerical_features + assert 'Weight' in dm.numerical_features + assert 'Favorite Food' in dm.categorical_features + assert 'TargetColumn' not in dm.numerical_features and 'TargetColumn' not in dm.categorical_features + +def test_setup_preprocessor_basic_inference(basic_data_module, capsys): + """ + Test setup_preprocessor with basic inference and no specific settings. + Should result in a ColumnTransformer with no active transformers. + """ + dm = basic_data_module + X, _ = dm.load_and_prepare() + + assert isinstance(dm.preprocessor, ColumnTransformer) + assert len(dm.preprocessor.transformers) == 0 + assert dm.preprocessor.remainder == 'passthrough' + + captured = capsys.readouterr() + assert "No active transformers configured for any columns. Creating a passthrough preprocessor." in captured.out + + +def test_setup_preprocessor_with_custom_settings(configured_data_module): + """ + Test setup_preprocessor with custom settings (StandardScaler, OneHotEncoder). + Should result in a ColumnTransformer. 
+ """ + dm = configured_data_module + X, y = dm.load_and_prepare() + + assert isinstance(dm.preprocessor, ColumnTransformer) + + transformer_names = [name for name, _, _ in dm.preprocessor.transformers] + assert 'num_pipeline' in transformer_names + assert 'cat_pipeline' in transformer_names + + for name, pipeline, _ in dm.preprocessor.transformers: + if name == 'num_pipeline': + assert isinstance(pipeline, Pipeline) + assert any(isinstance(step[1], StandardScaler) for step in pipeline.steps) + if name == 'cat_pipeline': + assert isinstance(pipeline, Pipeline) + assert any(isinstance(step[1], OneHotEncoder) for step in pipeline.steps) + + +def test_create_and_fit_preprocessor(configured_data_module): + """Test creating and fitting the preprocessor.""" + dm = configured_data_module + X, _ = dm.load_and_prepare() + + fitted_preprocessor = dm.create_and_fit_preprocessor(X) + + assert isinstance(fitted_preprocessor, ColumnTransformer) + assert hasattr(fitted_preprocessor, 'transform') + +def test_transform_data(configured_data_module): + """Test data transformation.""" + dm = configured_data_module + X, y = dm.load_and_prepare() + dm.preprocessor = dm.create_and_fit_preprocessor(X) + + X_transformed = dm.transform_data(X) + + assert isinstance(X_transformed, np.ndarray) + assert X_transformed.shape[0] == X.shape[0] + + assert X_transformed.shape[1] == 5 + + +def test_perform_train_test_split_no_stratify(basic_data_module): + """Test non-stratified train-test split.""" + dm = basic_data_module + dm.stratify = False + X, y = dm.load_and_prepare() + + X_train, X_test, y_train, y_test = dm.perform_train_test_split(X, y) + + assert len(X_train) > 0 + assert len(X_test) > 0 + assert len(y_train) > 0 + assert len(y_test) > 0 + assert len(X_train) + len(X_test) == len(X) + assert len(y_train) + len(y_test) == len(y) + +def test_perform_train_test_split_stratified(configured_data_module): + """Test stratified train-test split.""" + dm = configured_data_module + X, y = 
dm.load_and_prepare() + + assert dm.stratify is True + X_train, X_test, y_train, y_test = dm.perform_train_test_split(X, y) + + assert len(X_train) > 0 + assert len(X_test) > 0 + assert len(y_train) > 0 + assert len(y_test) > 0 + assert len(X_train) + len(X_test) == len(X) + assert len(y_train) + len(y_test) == len(y) + + train_counts = y_train.value_counts(normalize=True) + test_counts = y_test.value_counts(normalize=True) + overall_counts = y.value_counts(normalize=True) + + pd.testing.assert_series_equal(train_counts, overall_counts, check_exact=False, rtol=0.1, atol=0.1, check_index=False) + pd.testing.assert_series_equal(test_counts, overall_counts, check_exact=False, rtol=0.1, atol=0.1, check_index=False) + + +def test_visualize_column_distributions(basic_data_module, capsys): + """Test column distribution visualization.""" + dm = basic_data_module + dm.visualise = True + dm.load_and_prepare() + dm.visualize_column_distributions() + captured = capsys.readouterr() + + assert "--- Generating Column Distribution Visualizations ---" in captured.out + assert "Column distribution visualizations complete." in captured.out + + assert "Displaying distributions for numerical columns..." in captured.out + assert "Displaying distributions for categorical columns..." in captured.out + + +def test_check_imbalance_runs(configured_data_module, capsys): + """Test that imbalance check runs and prints output when enabled.""" + dm = configured_data_module + dm.load_and_prepare() + captured = capsys.readouterr() + + assert "Checking target class imbalance for 'TargetColumn'..." 
in captured.out + assert "Class distribution for 'TargetColumn':" in captured.out + + +def test_get_feature_names_after_preprocessing_passthrough(basic_data_module): + """Test getting feature names when preprocessor is a passthrough ColumnTransformer.""" + dm = basic_data_module + X_raw, _ = dm.load_and_prepare() + + initial_feature_names = X_raw.columns.tolist() + + feature_names = dm.get_feature_names_after_preprocessing(dm.preprocessor, initial_feature_names) + + expected_names = ['Gender', 'Age', 'Height', 'Weight', 'Favorite Food'] + assert sorted(feature_names) == sorted(expected_names) + + +def test_get_feature_names_after_preprocessing_with_transformers(configured_data_module): + """Test getting feature names after preprocessing with actual transformers.""" + dm = configured_data_module + X_raw, _ = dm.load_and_prepare() + dm.preprocessor = dm.create_and_fit_preprocessor(X_raw) + + initial_feature_names = X_raw.columns.tolist() + + fitted_preprocessor = dm.create_and_fit_preprocessor(X_raw) + feature_names = dm.get_feature_names_after_preprocessing(fitted_preprocessor, initial_feature_names) + + expected_names_parts = [ + 'num_pipeline__Age', + 'num_pipeline__Height', + 'num_pipeline__Weight', + 'cat_pipeline__Gender_Female', + 'cat_pipeline__Gender_Male' + ] + + + for expected_part in expected_names_parts: + assert any(expected_part in name for name in feature_names), f"'{expected_part}' not found in feature names: {feature_names}" + + + assert len(feature_names) == 5 + +def test_empty_columns_to_drop(dummy_data_path): + dm = DataModule(data_path=dummy_data_path, target_column='TargetColumn', columns_to_drop=[]) + X, _ = dm.load_and_prepare() + assert 'Favorite Food' in X.columns + +def test_non_existent_columns_to_drop(dummy_data_path, capsys): + dm = DataModule(data_path=dummy_data_path, target_column='TargetColumn', columns_to_drop=['NonExistentColumn']) + X, _ = dm.load_and_prepare() + captured = capsys.readouterr() + assert "No specified columns were 
dropped (they might not exist)." in captured.out + assert 'NonExistentColumn' not in X.columns + +def test_no_numerical_or_categorical_features(tmp_path_factory): + + data_file = tmp_path_factory.mktemp("data_minimal") / "minimal_data.csv" + pd.DataFrame({'A': [1,2,3], 'Target': [0,1,0]}).to_csv(data_file, index=False) + dm = DataModule(data_path=str(data_file), target_column='Target') + X, y = dm.load_and_prepare() + assert 'A' in dm.numerical_features + assert not dm.categorical_features # No categorical features + assert isinstance(dm.preprocessor, ColumnTransformer) + assert len(dm.preprocessor.transformers) == 0 + + fitted_preprocessor = dm.create_and_fit_preprocessor(X) + X_transformed = dm.transform_data(X) + assert X_transformed.shape[1] == 1 + + data_file_empty = tmp_path_factory.mktemp("data_empty") / "empty_data.csv" + pd.DataFrame({'A': [1,2,3], 'B': ['x','y','z'], 'Target': [0,1,0]}).to_csv(data_file_empty, index=False) + + dm_empty_settings = DataModule( + data_path=str(data_file_empty), + target_column='Target', + preprocessor_settings={'numerical': {}, 'categorical': {}} + ) + X_empty, _ = dm_empty_settings.load_and_prepare() + + assert isinstance(dm_empty_settings.preprocessor, ColumnTransformer) + assert len(dm_empty_settings.preprocessor.transformers) == 0 + assert dm_empty_settings.preprocessor.remainder == 'passthrough' + + dm_empty_settings.create_and_fit_preprocessor(X_empty) + + transformed_empty = dm_empty_settings.transform_data(X_empty) + + assert transformed_empty.shape[1] == 2 + + + diff --git a/Structured_data_template/tests/test_parse_config.py b/Structured_data_template/tests/test_parse_config.py new file mode 100644 index 0000000..b73f3cd --- /dev/null +++ b/Structured_data_template/tests/test_parse_config.py @@ -0,0 +1,236 @@ +import pytest +import os +import configparser +import json # Used for comparison with json.loads output + + +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from 
project.utils.parse_config import ( + load_config, + get_model_and_hyperparams, + get_data_config, + get_grid_search_config, + get_grid_search_params, + get_logging_config, + get_training_config +) +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from xgboost import XGBClassifier, XGBRegressor +from sklearn.linear_model import LinearRegression + +# Define a temporary config file content for testing +TEMP_CONFIG_CONTENT = """ +[model] +model_name = xgboost +task = classification + +[hyperparameters.xgboost] +n_estimators = 100 +max_depth = 5 +learning_rate = 0.1 + +[data] +data_path = test_data.csv +target_column = TargetColumn +visualise_data = yes +check_imbalance = no +test_size = 0.25 +random_state = 42 +stratify = yes +columns_to_drop = colA, colB +categorical_columns_settings = {"Gender": {"imputer": "cat_imputer", "encoder": "onehot", "encoder_options": {"sparse_output": false}}} +numerical_columns_settings = {"Age": {"imputer": "num_imputer", "scaler": "standard"}} + +[training] +use_kfold_cv = yes +n_splits = 5 +shuffle_cv = yes +random_state_cv = 123 +use_optuna = no +n_trials = 50 +timeout = 300 +optuna_direction = maximize + +[logging] +save_model = yes +model_output_path = models/ +mlflow_tracking_uri = file:///tmp/mlruns + +[grid_params.xgboost] +params = {'n_estimators': [50, 100], 'max_depth': [3, 5]} +""" + +# Pytest fixture to create a temporary config file for each test +@pytest.fixture +def temp_config_file(tmp_path): + config_file = tmp_path / "test_config.cfg" + config_file.write_text(TEMP_CONFIG_CONTENT) + return str(config_file) + +# --- Tests for parse_config.py functions --- + +def test_load_config(temp_config_file): + config = load_config(temp_config_file) + assert isinstance(config, configparser.ConfigParser) + assert config['model']['model_name'] == 'xgboost' + assert config['data']['target_column'] == 'TargetColumn' + +def test_get_model_and_hyperparams(temp_config_file): + config = 
load_config(temp_config_file) + model_class, hyperparams = get_model_and_hyperparams(config) + + assert model_class == XGBClassifier + assert isinstance(hyperparams, dict) + assert hyperparams['n_estimators'] == '100' # Note: configparser reads as strings + assert hyperparams['max_depth'] == '5' + +def test_get_model_and_hyperparams_linear_regression(tmp_path): + # Test linear regression specific case + config_file = tmp_path / "test_lin_reg_config.cfg" + config_file.write_text(""" + [model] + model_name = linear_regression + task = regression + [hyperparameters.linear_regression] + fit_intercept = true + """) + config = load_config(str(config_file)) + model_class, hyperparams = get_model_and_hyperparams(config) + assert model_class == LinearRegression + assert hyperparams['fit_intercept'] == 'true' + +def test_get_model_and_hyperparams_unsupported_model(tmp_path): + config_file = tmp_path / "test_unsupported_config.cfg" + config_file.write_text(""" + [model] + model_name = unsupported_model + task = classification + """) + config = load_config(str(config_file)) + with pytest.raises(ValueError, match="Unsupported model name: unsupported_model"): + get_model_and_hyperparams(config) + +def test_get_model_and_hyperparams_linear_regression_classification_task(tmp_path): + config_file = tmp_path / "test_lin_reg_classification_config.cfg" + config_file.write_text(""" + [model] + model_name = linear_regression + task = classification + """) + config = load_config(str(config_file)) + with pytest.raises(ValueError, match="Linear Regression is only applicable for regression tasks."): + get_model_and_hyperparams(config) + + +def test_get_data_config(temp_config_file): + config = load_config(temp_config_file) + data_config = get_data_config(config) + + assert isinstance(data_config, dict) + assert data_config['data_path'] == 'test_data.csv' + assert data_config['target_column'] == 'TargetColumn' + assert data_config['visualise_data'] is True + assert 
data_config['check_imbalance'] is False + assert data_config['test_size'] == 0.25 + assert data_config['random_state'] == 42 + assert data_config['stratify'] is True + assert data_config['columns_to_drop'] == ['colA', 'colB'] + + # Test JSON parsing for dict settings + assert isinstance(data_config['categorical_columns_settings'], dict) + assert data_config['categorical_columns_settings'] == {'Gender': {'imputer': 'cat_imputer', 'encoder': 'onehot', 'encoder_options': {'sparse_output': False}}} + + assert isinstance(data_config['numerical_columns_settings'], dict) + assert data_config['numerical_columns_settings'] == {'Age': {'imputer': 'num_imputer', 'scaler': 'standard'}} + +def test_get_data_config_no_json_settings(tmp_path): + config_file = tmp_path / "test_no_json_config.cfg" + config_file.write_text(""" + [data] + data_path = path.csv + target_column = Y + visualise_data = no + check_imbalance = no + test_size = 0.2 + random_state = 0 + stratify = no + """) + config = load_config(str(config_file)) + data_config = get_data_config(config) + assert data_config['categorical_columns_settings'] == {} + assert data_config['numerical_columns_settings'] == {} + +def test_get_data_config_invalid_json(tmp_path): + config_file = tmp_path / "test_invalid_json_config.cfg" + config_file.write_text(""" + [data] + data_path = path.csv + target_column = Y + visualise_data = no + check_imbalance = no + test_size = 0.2 + random_state = 0 + stratify = no + categorical_columns_settings = {"Gender": "invalid_json + """) # Malformed JSON + config = load_config(str(config_file)) + with pytest.raises(ValueError, match="Error parsing categorical_columns_settings JSON"): + get_data_config(config) + +def test_get_grid_search_config(temp_config_file): + config = load_config(temp_config_file) + grid_search_config = get_grid_search_config(config) + + assert isinstance(grid_search_config, dict) + assert grid_search_config['use_optuna'] is False + assert grid_search_config['n_trials'] == 50 + 
assert grid_search_config['timeout'] == 300 + assert grid_search_config['optuna_direction'] == 'maximize' + assert grid_search_config['use_kfold_cv'] is True + assert grid_search_config['n_splits'] == 5 + assert grid_search_config['shuffle_cv'] is True + assert grid_search_config['random_state_cv'] == 123 + +def test_get_grid_search_params(temp_config_file): + config = load_config(temp_config_file) + params = get_grid_search_params(config, 'xgboost') + assert isinstance(params, dict) + assert params == {'n_estimators': [50, 100], 'max_depth': [3, 5]} + +def test_get_grid_search_params_no_section(temp_config_file): + config = load_config(temp_config_file) + params = get_grid_search_params(config, 'non_existent_model') + assert params == {} + +def test_get_grid_search_params_invalid_eval_string(tmp_path): + config_file = tmp_path / "test_invalid_eval_config.cfg" + config_file.write_text(""" + [grid_params.test_model] + params = {'n_estimators': [50, 100], 'max_depth': [3, 5], + """) # Malformed Python dict string + config = load_config(str(config_file)) + with pytest.raises(ValueError, match="Error parsing param_grid for test_model"): + get_grid_search_params(config, 'test_model') + + +def test_get_logging_config(temp_config_file): + config = load_config(temp_config_file) + logging_config = get_logging_config(config) + assert isinstance(logging_config, dict) + assert logging_config['save_model'] is True + assert logging_config['model_output_path'] == 'models/' + assert logging_config['mlflow_tracking_uri'] == 'file:///tmp/mlruns' + +def test_get_training_config(temp_config_file): + config = load_config(temp_config_file) + training_config = get_training_config(config) + assert isinstance(training_config, dict) + assert training_config['use_kfold_cv'] is True + assert training_config['n_splits'] == 5 + assert training_config['shuffle_cv'] is True + assert training_config['random_state_cv'] == 123 + assert training_config['use_optuna'] is False + assert 
training_config['n_trials'] == 50 + assert training_config['timeout'] == 300 + assert training_config['optuna_direction'] == 'maximize' \ No newline at end of file diff --git a/Structured_data_template/tests/unit_test.py b/Structured_data_template/tests/unit_test.py new file mode 100644 index 0000000..66702fa --- /dev/null +++ b/Structured_data_template/tests/unit_test.py @@ -0,0 +1,667 @@ +""" +Comprehensive unit tests for the ML pipeline components. +Tests cover DataLoader, DataModule, data utilities, and configuration parsing. +""" + +import unittest +import pandas as pd +import numpy as np +import tempfile +import json +import os +from pathlib import Path +from unittest.mock import patch, MagicMock +import configparser +import sys + +# Add parent directory to path for imports +sys.path.append(str(Path(__file__).parent.parent)) + +# Import the modules to test +from train.src.DataLoader import DataLoader +from train.src.DataModule import DataModule +from train.src.utils.data_utils import FeatureEngineer, FeatureSelector, DataTransformer, DataProfiler +from train.src.utils.parse_config import load_config, get_model_and_hyperparams, get_data_config +from train.src.utils.visualise import DataVisualizer + + +class TestDataLoader(unittest.TestCase): + """Test suite for DataLoader functionality.""" + + def setUp(self): + """Set up test fixtures.""" + # Create sample data + self.sample_data = pd.DataFrame({ + 'numerical_col1': [1, 2, 3, 4, 5, 100], # Contains outlier + 'numerical_col2': [10.5, 20.1, 30.2, 40.3, 50.4, 60.5], + 'categorical_col': ['A', 'B', 'A', 'C', 'B', 'A'], + 'target': ['class1', 'class2', 'class1', 'class3', 'class2', 'class1'] + }) + + # Create temporary CSV file + self.temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) + self.sample_data.to_csv(self.temp_file.name, index=False) + self.temp_file.close() + + # Initialize DataLoader + self.data_loader = DataLoader( + data_path=self.temp_file.name, + target_column='target', 
+ visualise=False, # Disable for testing + feature_engineering=False + ) + + def tearDown(self): + """Clean up test fixtures.""" + os.unlink(self.temp_file.name) + + def test_data_loading(self): + """Test basic data loading functionality.""" + self.data_loader._load_data() + + self.assertIsNotNone(self.data_loader.data) + self.assertEqual(self.data_loader.data.shape, (6, 4)) + self.assertTrue('target' in self.data_loader.data.columns) + + def test_column_type_inference(self): + """Test automatic column type inference.""" + self.data_loader._load_data() + X, y = self.data_loader._split_features_target() + self.data_loader._infer_column_types(X) + + expected_numerical = ['numerical_col1', 'numerical_col2'] + expected_categorical = ['categorical_col'] + + self.assertEqual(set(self.data_loader.numerical_features), set(expected_numerical)) + self.assertEqual(set(self.data_loader.categorical_features), set(expected_categorical)) + + def test_data_cleaning_removes_duplicates(self): + """Test that data cleaning removes duplicate rows.""" + # Add duplicate row + duplicate_data = pd.concat([self.sample_data, self.sample_data.iloc[[0]]], ignore_index=True) + duplicate_data.to_csv(self.temp_file.name, index=False) + + self.data_loader._load_data() + initial_shape = self.data_loader.data.shape + self.data_loader._clean_data() + final_shape = self.data_loader.data.shape + + self.assertLess(final_shape[0], initial_shape[0]) + + def test_columns_to_drop(self): + """Test column dropping functionality.""" + self.data_loader.columns_to_drop = ['numerical_col1'] + self.data_loader._load_data() + self.data_loader._drop_columns() + + self.assertNotIn('numerical_col1', self.data_loader.data.columns) + self.assertIn('numerical_col2', self.data_loader.data.columns) + + def test_target_validation(self): + """Test target column validation.""" + self.data_loader.target_column = 'nonexistent_column' + self.data_loader._load_data() + + with self.assertRaises(ValueError): + 
self.data_loader._validate_target_column() + + def test_complete_pipeline(self): + """Test the complete data loading and preparation pipeline.""" + X, y = self.data_loader.load_and_prepare_data() + + self.assertIsInstance(X, pd.DataFrame) + self.assertIsInstance(y, pd.Series) + self.assertEqual(len(X), len(y)) + self.assertNotIn(self.data_loader.target_column, X.columns) + + +class TestDataModule(unittest.TestCase): + """Test suite for DataModule functionality.""" + + def setUp(self): + """Set up test fixtures.""" + # Create sample data + self.sample_data = pd.DataFrame({ + 'num_col': [1, 2, 3, 4, 5], + 'cat_col': ['A', 'B', 'A', 'C', 'B'], + 'target': [0, 1, 0, 1, 0] + }) + + # Create temporary CSV file + self.temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) + self.sample_data.to_csv(self.temp_file.name, index=False) + self.temp_file.close() + + # Setup preprocessing settings + self.preprocessor_settings = { + 'numerical': { + 'num_col': {'imputer': 'mean', 'scaler': 'standard'} + }, + 'categorical': { + 'cat_col': {'imputer': 'mode', 'encoder': 'onehot'} + } + } + + self.data_module = DataModule( + data_path=self.temp_file.name, + target_column='target', + preprocessor_settings=self.preprocessor_settings, + visualise=False + ) + + def tearDown(self): + """Clean up test fixtures.""" + os.unlink(self.temp_file.name) + + def test_load_and_prepare(self): + """Test the complete load and prepare pipeline.""" + X, y = self.data_module.load_and_prepare() + + self.assertIsInstance(X, pd.DataFrame) + self.assertIsInstance(y, pd.Series) + self.assertIsNotNone(self.data_module.preprocessor) + + def test_preprocessor_creation(self): + """Test preprocessing pipeline creation.""" + X, y = self.data_module.load_and_prepare() + + # Check that preprocessor was created + self.assertIsNotNone(self.data_module.preprocessor) + + # Check that it has the expected transformers + transformer_names = [name for name, _, _ in 
self.data_module.preprocessor.transformers_] + self.assertIn('num', transformer_names) + self.assertIn('cat', transformer_names) + + def test_train_test_split(self): + """Test train-test splitting functionality.""" + X, y = self.data_module.load_and_prepare() + X_train, X_test, y_train, y_test = self.data_module.perform_train_test_split(X, y) + + # Check shapes + self.assertEqual(len(X_train) + len(X_test), len(X)) + self.assertEqual(len(y_train) + len(y_test), len(y)) + + # Check that split maintains data integrity + self.assertEqual(X_train.shape[1], X_test.shape[1]) + + def test_preprocessor_fit_transform(self): + """Test preprocessor fitting and transformation.""" + X, y = self.data_module.load_and_prepare() + + # Fit preprocessor + fitted_preprocessor = self.data_module.create_and_fit_preprocessor(X) + + # Transform data + X_transformed = self.data_module.transform_data(X) + + self.assertIsInstance(X_transformed, np.ndarray) + self.assertEqual(X_transformed.shape[0], len(X)) + + +class TestFeatureEngineer(unittest.TestCase): + """Test suite for FeatureEngineer utility class.""" + + def setUp(self): + """Set up test fixtures.""" + self.feature_engineer = FeatureEngineer(random_state=42) + self.sample_data = pd.DataFrame({ + 'num1': [1, 2, 3, 4, 5], + 'num2': [10, 20, 30, 40, 50], + 'cat1': ['A', 'B', 'A', 'C', 'B'] + }) + + def test_interaction_features(self): + """Test interaction feature creation.""" + result = self.feature_engineer.create_interaction_features( + self.sample_data, + ['num1', 'num2'], + max_combinations=5 + ) + + # Check that interaction feature was created + interaction_cols = [col for col in result.columns if '_x_' in col] + self.assertTrue(len(interaction_cols) > 0) + self.assertIn('num1_x_num2', result.columns) + + # Check interaction values are correct + expected_interaction = self.sample_data['num1'] * self.sample_data['num2'] + pd.testing.assert_series_equal( + result['num1_x_num2'], + expected_interaction, + check_names=False + ) + + 
def test_polynomial_features(self): + """Test polynomial feature creation.""" + result = self.feature_engineer.create_polynomial_features( + self.sample_data, + ['num1'], + degree=2 + ) + + # Check that polynomial features were created + poly_cols = [col for col in result.columns if 'num1^2' in col or 'num1 num1' in col] + self.assertTrue(len(poly_cols) > 0) + + def test_binning_features(self): + """Test binning feature creation.""" + result = self.feature_engineer.create_binning_features( + self.sample_data, + 'num1', + n_bins=3 + ) + + # Check that binning features were created + bin_cols = [col for col in result.columns if 'bin' in col] + self.assertTrue(len(bin_cols) > 0) + + def test_feature_creation_log(self): + """Test that feature creation is properly logged.""" + initial_log_length = len(self.feature_engineer.feature_creation_log) + + self.feature_engineer.create_interaction_features( + self.sample_data, + ['num1', 'num2'] + ) + + final_log_length = len(self.feature_engineer.feature_creation_log) + self.assertGreater(final_log_length, initial_log_length) + + +class TestDataTransformer(unittest.TestCase): + """Test suite for DataTransformer utility class.""" + + def setUp(self): + """Set up test fixtures.""" + self.data_transformer = DataTransformer(random_state=42) + self.sample_data = pd.DataFrame({ + 'normal_col': [1, 2, 3, 4, 5], + 'outlier_col': [1, 2, 3, 4, 100], # Contains outlier + 'skewed_col': [1, 2, 4, 8, 16] # Positively skewed + }) + + def test_outlier_handling_iqr(self): + """Test IQR-based outlier handling.""" + result = self.data_transformer.handle_outliers( + self.sample_data, + ['outlier_col'], + method='iqr' + ) + + # The outlier (100) should be clipped + self.assertLess(result['outlier_col'].max(), 100) + self.assertEqual(len(result), len(self.sample_data)) + + def test_log_transformation(self): + """Test log transformation.""" + result = self.data_transformer.apply_log_transform( + self.sample_data, + ['skewed_col'] + ) + + # Check that 
log column was created + self.assertIn('skewed_col_log', result.columns) + + # Check log values are correct + expected_log = np.log(self.sample_data['skewed_col']) + pd.testing.assert_series_equal( + result['skewed_col_log'], + expected_log, + check_names=False + ) + + def test_log_transformation_non_positive(self): + """Test log transformation with non-positive values.""" + data_with_zeros = self.sample_data.copy() + data_with_zeros.loc[0, 'skewed_col'] = 0 + + result = self.data_transformer.apply_log_transform( + data_with_zeros, + ['skewed_col'] + ) + + # Should use log1p for non-positive values + self.assertIn('skewed_col_log', result.columns) + + +class TestDataProfiler(unittest.TestCase): + """Test suite for DataProfiler utility class.""" + + def setUp(self): + """Set up test fixtures.""" + self.data_profiler = DataProfiler() + self.sample_data = pd.DataFrame({ + 'numerical': [1, 2, 3, 4, 5, np.nan], + 'categorical': ['A', 'B', 'A', 'C', 'B', 'A'], + 'target': [0, 1, 0, 1, 0, 1] + }) + + def test_data_profile_generation(self): + """Test comprehensive data profile generation.""" + profile = self.data_profiler.generate_data_profile( + self.sample_data, + target_col='target' + ) + + # Check that all major sections are present + expected_sections = ['overview', 'columns', 'missing_data', 'data_quality'] + for section in expected_sections: + self.assertIn(section, profile) + + # Check overview section + self.assertEqual(profile['overview']['shape'], (6, 3)) + self.assertIn('numerical_columns', profile['overview']) + self.assertIn('categorical_columns', profile['overview']) + + def test_column_profiling(self): + """Test individual column profiling.""" + profile = self.data_profiler.generate_data_profile(self.sample_data) + + # Check numerical column profile + num_profile = profile['columns']['numerical'] + self.assertEqual(num_profile['missing_count'], 1) + self.assertIn('mean', num_profile) + self.assertIn('std', num_profile) + + # Check categorical column profile 
+ cat_profile = profile['columns']['categorical'] + self.assertEqual(cat_profile['missing_count'], 0) + self.assertIn('most_frequent', cat_profile) + self.assertIn('unique_count', cat_profile) + + def test_missing_data_analysis(self): + """Test missing data analysis.""" + profile = self.data_profiler.generate_data_profile(self.sample_data) + + missing_data = profile['missing_data'] + self.assertEqual(missing_data['total_missing_cells'], 1) + self.assertIn('numerical', missing_data['columns_with_missing']) + + def test_json_serialization(self): + """Test that profile can be serialized to JSON.""" + profile = self.data_profiler.generate_data_profile(self.sample_data) + + # This should not raise an exception + with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file: + try: + output_path = self.data_profiler.save_profile_report(profile, temp_file.name) + self.assertTrue(os.path.exists(output_path)) + + # Verify the file contains valid JSON + with open(output_path, 'r') as f: + loaded_profile = json.load(f) + self.assertIsInstance(loaded_profile, dict) + finally: + if os.path.exists(temp_file.name): + os.unlink(temp_file.name) + + +class TestConfigParsing(unittest.TestCase): + """Test suite for configuration parsing functionality.""" + + def setUp(self): + """Set up test fixtures.""" + self.config_content = """ +[project] +name = test_project + +[data] +data_path = /path/to/data.csv +target_column = target +visualise_data = true +feature_engineering = false +test_size = 0.2 +random_state = 42 + +[model] +model_name = logistic_regression +task = classification + +[hyperparameters.logistic_regression] +C = 1.0 +penalty = l2 +max_iter = 1000 + +[preprocessing.numerical] +feature1.imputer = mean +feature1.scaler = standard + +[preprocessing.categorical] +feature2.encoder = onehot +feature2.encoder_options = {"sparse_output": false} +""" + + # Create temporary config file + self.temp_config = tempfile.NamedTemporaryFile(mode='w', suffix='.cfg', 
delete=False) + self.temp_config.write(self.config_content) + self.temp_config.close() + + def tearDown(self): + """Clean up test fixtures.""" + os.unlink(self.temp_config.name) + + def test_config_loading(self): + """Test basic configuration loading.""" + config = load_config(self.temp_config.name) + + self.assertIsInstance(config, configparser.ConfigParser) + self.assertEqual(config['project']['name'], 'test_project') + self.assertEqual(config['data']['target_column'], 'target') + + def test_model_and_hyperparams_extraction(self): + """Test model and hyperparameter extraction.""" + config = load_config(self.temp_config.name) + model_class, hyperparams = get_model_and_hyperparams(config) + + # Check that we get the right model class + from sklearn.linear_model import LogisticRegression + self.assertEqual(model_class, LogisticRegression) + + # Check hyperparameters + self.assertIn('C', hyperparams) + self.assertEqual(hyperparams['C'], 1.0) + self.assertEqual(hyperparams['penalty'], 'l2') + + def test_data_config_extraction(self): + """Test data configuration extraction.""" + config = load_config(self.temp_config.name) + data_config = get_data_config(config) + + # Check basic data settings + self.assertEqual(data_config['data_path'], '/path/to/data.csv') + self.assertEqual(data_config['target_column'], 'target') + self.assertTrue(data_config['visualise_data']) + self.assertFalse(data_config['feature_engineering']) + + # Check preprocessing settings + self.assertIn('preprocessor_settings', data_config) + preprocessing = data_config['preprocessor_settings'] + + self.assertIn('numerical', preprocessing) + self.assertIn('categorical', preprocessing) + + # Check specific preprocessing settings + self.assertEqual(preprocessing['numerical']['feature1']['imputer'], 'mean') + self.assertEqual(preprocessing['categorical']['feature2']['encoder'], 'onehot') + + +class TestVisualizerRobustness(unittest.TestCase): + """Test suite for visualizer error handling and robustness.""" + 
+ def setUp(self): + """Set up test fixtures.""" + self.visualizer = DataVisualizer(output_dir="test_plots") + self.sample_data = pd.DataFrame({ + 'numerical': [1, 2, 3, 4, 5], + 'categorical': ['A', 'B', 'A', 'C', 'B'], + 'mixed_target': ['class1', 'class2', 'class1', 'class2', 'class1'] + }) + + def tearDown(self): + """Clean up test plots directory.""" + import shutil + if os.path.exists("test_plots"): + shutil.rmtree("test_plots") + + @patch('matplotlib.pyplot.show') + @patch('matplotlib.pyplot.savefig') + def test_categorical_target_correlation_handling(self, mock_savefig, mock_show): + """Test that categorical targets don't break correlation analysis.""" + # This should not raise an exception + self.visualizer._plot_target_distribution( + self.sample_data, + 'mixed_target', + 'test_target_plot' + ) + + # Verify that savefig was called (plot was created) + mock_savefig.assert_called() + + @patch('matplotlib.pyplot.show') + @patch('matplotlib.pyplot.savefig') + def test_empty_data_handling(self, mock_savefig, mock_show): + """Test handling of edge cases like empty data.""" + empty_data = pd.DataFrame() + + # Should handle empty data gracefully + try: + self.visualizer.plot_distributions(empty_data) + except Exception as e: + # Should not crash with unhandled exceptions + self.fail(f"Visualizer should handle empty data gracefully, but raised: {e}") + + +class TestIntegration(unittest.TestCase): + """Integration tests for the complete pipeline.""" + + def setUp(self): + """Set up integration test fixtures.""" + # Create a more comprehensive dataset + np.random.seed(42) + self.sample_data = pd.DataFrame({ + 'age': np.random.randint(18, 80, 100), + 'income': np.random.normal(50000, 15000, 100), + 'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], 100), + 'city': np.random.choice(['NYC', 'LA', 'Chicago', 'Houston'], 100), + 'target': np.random.choice(['A', 'B', 'C'], 100) + }) + + # Create temporary CSV + self.temp_file = 
tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) + self.sample_data.to_csv(self.temp_file.name, index=False) + self.temp_file.close() + + # Create config + self.config_content = f""" +[data] +data_path = {self.temp_file.name} +target_column = target +visualise_data = false +feature_engineering = false +test_size = 0.2 +random_state = 42 + +[preprocessing.numerical] +age.imputer = median +age.scaler = standard +income.imputer = mean +income.scaler = standard + +[preprocessing.categorical] +education.encoder = onehot +city.encoder = onehot +""" + + self.temp_config = tempfile.NamedTemporaryFile(mode='w', suffix='.cfg', delete=False) + self.temp_config.write(self.config_content) + self.temp_config.close() + + def tearDown(self): + """Clean up integration test fixtures.""" + os.unlink(self.temp_file.name) + os.unlink(self.temp_config.name) + + def test_end_to_end_pipeline(self): + """Test the complete end-to-end pipeline.""" + # Load configuration + config = load_config(self.temp_config.name) + data_config = get_data_config(config) + + # Initialize DataModule + data_module = DataModule( + data_path=data_config['data_path'], + target_column=data_config['target_column'], + preprocessor_settings=data_config['preprocessor_settings'], + test_size=data_config['test_size'], + random_state=data_config['random_state'], + visualise=False + ) + + # Load and prepare data + X, y = data_module.load_and_prepare() + + # Perform train-test split + X_train, X_test, y_train, y_test = data_module.perform_train_test_split(X, y) + + # Fit preprocessor and transform data + fitted_preprocessor = data_module.create_and_fit_preprocessor(X_train) + X_train_transformed = data_module.transform_data(X_train) + X_test_transformed = data_module.transform_data(X_test) + + # Verify final results + self.assertIsInstance(X_train_transformed, np.ndarray) + self.assertIsInstance(X_test_transformed, np.ndarray) + self.assertEqual(X_train_transformed.shape[1], X_test_transformed.shape[1]) + 
self.assertGreater(X_train_transformed.shape[1], 2) # Should have expanded due to encoding + + +def run_all_tests(): + """Run all test suites.""" + # Create test suite + test_classes = [ + TestDataLoader, + TestDataModule, + TestFeatureEngineer, + TestDataTransformer, + TestDataProfiler, + TestConfigParsing, + TestVisualizerRobustness, + TestIntegration + ] + + suite = unittest.TestSuite() + + for test_class in test_classes: + tests = unittest.TestLoader().loadTestsFromTestCase(test_class) + suite.addTests(tests) + + # Run tests + runner = unittest.TextTestRunner(verbosity=2) + result = runner.run(suite) + + return result + + +if __name__ == "__main__": + # Run specific test class or all tests + import sys + + if len(sys.argv) > 1: + # Run specific test class + test_class_name = sys.argv[1] + if test_class_name in globals(): + unittest.main(argv=[''], test_class=globals()[test_class_name], verbosity=2) + else: + print(f"Test class {test_class_name} not found") + else: + # Run all tests + result = run_all_tests() + + # Print summary + print(f"\n{'='*50}") + print(f"TESTS RUN: {result.testsRun}") + print(f"FAILURES: {len(result.failures)}") + print(f"ERRORS: {len(result.errors)}") + print(f"SUCCESS RATE: {((result.testsRun - len(result.failures) - len(result.errors)) / result.testsRun * 100):.1f}%") + print(f"{'='*50}") \ No newline at end of file diff --git a/Structured_data_template/train/Dockerfile b/Structured_data_template/train/Dockerfile new file mode 100644 index 0000000..8887b7e --- /dev/null +++ b/Structured_data_template/train/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.10 + +WORKDIR /project + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential git rsync software-properties-common ffmpeg libsm6 libxext6 && \ + rm -rf /var/lib/apt/lists/* + +ENV PYTHONPATH="/mlflow/projects/code/:$PYTHONPATH" + +COPY . . 
+ +RUN python -m pip install --upgrade pip && \ + python -m pip install --no-cache-dir -r requirements.txt diff --git a/Structured_data_template/train/__init__.py b/Structured_data_template/train/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Structured_data_template/train/config/config.cfg b/Structured_data_template/train/config/config.cfg new file mode 100644 index 0000000..953f398 --- /dev/null +++ b/Structured_data_template/train/config/config.cfg @@ -0,0 +1,63 @@ +[project] +name = structured-data-ml + +[system] +cuda_visible_devices = 0 + +[data] +data_path = data/ObesityDataSet_raw_and_data_sinthetic.csv +target_column = NObeyesdad +visualise_data = False +check_imbalance = False +test_size = 0.2 +random_state = 42 +stratify = False + + +categorical_columns_settings = {"Gender": {"imputer": "cat_imputer", "encoder": "onehot", "encoder_options": {"sparse_output": false}}} +#, "FAVC": {"method": "onehot"}, "CAEC": {"method": "ordinal", "options": {"categories": [["no", "Sometimes", "Frequently", "Always"]]}}} + + +numerical_columns_settings = {"Weight": {"imputer": "num_imputer", "scaler": "standard"}, "Height": {"imputer": "num_imputer", "scaler": "standard"}} +columns_to_drop = family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS + + +[model] +model_name = xgboost +# Options: random_forest, xgboost, linear_regression +task = classification +# Options: classification, regression + +[hyperparameters.random_forest] +n_estimators = 100 +max_depth = 10 +min_samples_split = 2 +min_samples_leaf = 1 + +[hyperparameters.xgboost] +n_estimators = 100 +max_depth = 6 +learning_rate = 0.1 +subsample = 0.8 +colsample_bytree = 0.8 +objective = multi:softprob +eval_metric = mlogloss + +[hyperparameters.linear_regression] +fit_intercept = True +normalize = False + +[logging] +save_model = True +model_output_path = ./models/ +mlflow_tracking_uri = http://localhost:5001 + +[training] +use_kfold_cv = True +n_splits = 5 
+shuffle_cv = True +random_state_cv = 42 +use_optuna = True +n_trials = 10 +timeout = 600 +optuna_direction = maximize diff --git a/Structured_data_template/train/config/local_config.cfg b/Structured_data_template/train/config/local_config.cfg new file mode 100644 index 0000000..16a5e9b --- /dev/null +++ b/Structured_data_template/train/config/local_config.cfg @@ -0,0 +1,72 @@ +[project] +name = structured-data-ml + +[system] +cuda_visible_devices = 0 + +[data] +data_path = /Users/ksonar/Documents/Technical/project-template/data/heart.csv +target_column = target +visualise_data = true +check_imbalance = true +feature_engineering = false +test_size = 0.2 +random_state = 42 +stratify = False +columns_to_drop = + +# Preprocessing settings for categorical and numerical columns +[preprocessing.categorical] + + +[preprocessing.numerical] +age.imputer = median +age.scaler = standard +chol.imputer = median +chol.scaler = standard + +[model] +model_name = xgboost +task = classification + +[hyperparameters.random_forest] +n_estimators = 100 +max_depth = 10 +min_samples_split = 2 +min_samples_leaf = 1 +random_state = 42 + +[hyperparameters.xgboost] +n_estimators = 100 +max_depth = 6 +learning_rate = 0.1 +subsample = 0.8 +colsample_bytree = 0.8 +objective = multi:softprob +eval_metric = mlogloss +random_state = 42 + +[hyperparameters.linear_regression] +fit_intercept = true + +[hyperparameters.logistic_regression] +C = 1.0 +penalty = l2 +solver = lbfgs +max_iter = 1000 +random_state = 42 + +[logging] +save_model = true +model_output_path = ./models/ +mlflow_tracking_uri = http://localhost:5001 + +[training] +use_kfold_cv = true +n_splits = 5 +shuffle_cv = true +random_state_cv = 42 +use_optuna = true +n_trials = 10 +timeout = 600 +optuna_direction = maximize \ No newline at end of file diff --git a/Structured_data_template/train/plots/categorical_distributions.png b/Structured_data_template/train/plots/categorical_distributions.png new file mode 100644 index 0000000..88f930a 
Binary files /dev/null and b/Structured_data_template/train/plots/categorical_distributions.png differ diff --git a/Structured_data_template/train/plots/numerical_distributions.png b/Structured_data_template/train/plots/numerical_distributions.png new file mode 100644 index 0000000..4634c7a Binary files /dev/null and b/Structured_data_template/train/plots/numerical_distributions.png differ diff --git a/Structured_data_template/train/plots/target_distribution.png b/Structured_data_template/train/plots/target_distribution.png new file mode 100644 index 0000000..87d0309 Binary files /dev/null and b/Structured_data_template/train/plots/target_distribution.png differ diff --git a/Structured_data_template/train/scripts/__init__.py b/Structured_data_template/train/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Structured_data_template/train/scripts/train.py b/Structured_data_template/train/scripts/train.py new file mode 100644 index 0000000..85365ca --- /dev/null +++ b/Structured_data_template/train/scripts/train.py @@ -0,0 +1,398 @@ +#!/usr/bin/env python3 +""" +Training script for structured data ML models. +Supports various models, cross-validation, and MLflow logging. +Updated to work with new DataLoader and DataModule structure. 
+""" + +import sys +import os +from pathlib import Path + +# Add parent directory to path for imports +sys.path.append(str(Path(__file__).parent.parent)) + +import json +import joblib +import numpy as np +import pandas as pd +from typing import Dict, Any, Tuple, Optional + +import mlflow +from sklearn.metrics import ( + classification_report, accuracy_score, f1_score, precision_score, recall_score, + mean_squared_error, mean_absolute_error, r2_score +) +from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score +from sklearn.preprocessing import LabelEncoder + +from src.utils.parse_config import ( + load_config, get_model_and_hyperparams, get_data_config, + get_logging_config, get_training_config, validate_config +) +from src.DataModule import DataModule + + +class ModelTrainer: + """Handles model training, evaluation, and logging with new architecture.""" + + def __init__(self, config_path: str): + """Initialize trainer with configuration.""" + self.config_path = config_path + self.config = load_config(config_path) + + # Validate configuration + validate_config(self.config) + + # Extract configurations + self.model_class, self.initial_hyperparams = get_model_and_hyperparams(self.config) + self.model_name = self.config["model"]["model_name"] + self.task = self.config["model"]["task"] + self.data_config = get_data_config(self.config) + self.logging_config = get_logging_config(self.config) + self.training_config = get_training_config(self.config) + + # Initialize components + self.data_module = None + self.label_encoder = None + self.X = None + self.y = None + self.final_model = None + + print(f"Initializing {self.model_name} trainer for {self.task} task") + + def setup_mlflow(self) -> None: + """Setup MLflow tracking.""" + mlflow.set_tracking_uri(self.logging_config['mlflow_tracking_uri']) + experiment_name = f"{self.model_name}_{self.task}_experiment" + mlflow.set_experiment(experiment_name) + print(f"MLflow experiment: {experiment_name}") + + def 
load_data(self) -> None: + """Load and prepare data using the new DataLoader + DataModule architecture.""" + print("\n" + "="*60) + print("LOADING AND PREPARING DATA") + print("="*60) + + # Initialize DataModule with enhanced settings + self.data_module = DataModule( + data_path=self.data_config['data_path'], + target_column=self.data_config['target_column'], + columns_to_drop=self.data_config.get('columns_to_drop', []), + preprocessor_settings=self.data_config.get('preprocessor_settings', {}), + visualise=self.data_config.get('visualise_data'), + check_imbalance=self.data_config.get('check_imbalance', False), + test_size=self.data_config.get('test_size', 0.2), + stratify=self.data_config.get('stratify', False), + random_state=self.data_config.get('random_state', 42), + feature_engineering=self.data_config.get('feature_engineering', False) + ) + + # Load and prepare data (DataLoader handles the heavy lifting) + self.X, self.y = self.data_module.load_and_prepare() + + # Encode target for classification + if self.task == "classification": + self.label_encoder = LabelEncoder() + self.y = pd.Series( + self.label_encoder.fit_transform(self.y), + index=self.y.index, + name=self.y.name + ) + print(f"Target encoded. 
Classes: {list(self.label_encoder.classes_)}") + + # Print data summary + summary = self.data_module.get_data_summary() + print(f"\nFINAL DATA SUMMARY:") + print(f" Features shape: {self.X.shape}") + print(f" Target shape: {self.y.shape}") + print(f" Numerical features: {len(self.data_module.numerical_features)}") + print(f" Categorical features: {len(self.data_module.categorical_features)}") + + def load_hyperparameters(self) -> Dict[str, Any]: + """Load hyperparameters from Optuna or use default.""" + if self.training_config.get('use_optuna', False): + best_params_path = Path(f"models/best_params_{self.model_name}.json") + + if best_params_path.exists(): + with open(best_params_path, 'r') as f: + hyperparams = json.load(f) + print(f"Loaded optimized hyperparameters from {best_params_path}") + return hyperparams + else: + print(f"Optuna results not found at {best_params_path}") + print("Using default hyperparameters. Run tune.py first for optimal results.") + + return self.initial_hyperparams + + def perform_cross_validation(self, hyperparams: Dict[str, Any]) -> Dict[str, float]: + """Perform cross-validation and return metrics.""" + print(f"\nCROSS-VALIDATION ({self.training_config['n_splits']}-fold)") + print("-" * 50) + + # Setup cross-validation + if self.task == "classification" and self.data_config.get('stratify', False): + cv = StratifiedKFold( + n_splits=self.training_config['n_splits'], + shuffle=self.training_config['shuffle_cv'], + random_state=self.training_config['random_state_cv'] + ) + print("Using stratified K-fold cross-validation") + else: + cv = KFold( + n_splits=self.training_config['n_splits'], + shuffle=self.training_config['shuffle_cv'], + random_state=self.training_config['random_state_cv'] + ) + print("Using standard K-fold cross-validation") + + # Fit preprocessor and transform data + fitted_preprocessor = self.data_module.create_and_fit_preprocessor(self.X) + X_processed = fitted_preprocessor.transform(self.X) + + # Create model + model 
= self.model_class(**hyperparams) + print(f"Created {self.model_name} model with {len(hyperparams)} hyperparameters") + + # Perform cross-validation + if self.task == "classification": + cv_scores = cross_val_score(model, X_processed, self.y, cv=cv, scoring='accuracy', n_jobs=-1) + metric_name = 'accuracy' + else: + cv_scores = cross_val_score(model, X_processed, self.y, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1) + cv_scores = np.sqrt(-cv_scores) + metric_name = 'rmse' + + cv_results = { + f'cv_mean_{metric_name}': cv_scores.mean(), + f'cv_std_{metric_name}': cv_scores.std(), + 'cv_scores': cv_scores.tolist() + } + + print(f"CV {metric_name.upper()}: {cv_results[f'cv_mean_{metric_name}']:.4f} ± {cv_results[f'cv_std_{metric_name}']:.4f}") + print(f" Individual fold scores: {[f'{score:.4f}' for score in cv_scores]}") + + return cv_results + + def train_final_model(self, hyperparams: Dict[str, Any]) -> None: + """Train final model on all data.""" + print(f"\nTRAINING FINAL MODEL") + print("-" * 30) + + # Fit preprocessor and transform data + fitted_preprocessor = self.data_module.create_and_fit_preprocessor(self.X) + X_processed = fitted_preprocessor.transform(self.X) + + # Train final model + self.final_model = self.model_class(**hyperparams) + print(f"Training {self.model_name} on {X_processed.shape[0]} samples with {X_processed.shape[1]} features") + + self.final_model.fit(X_processed, self.y) + + print("Final model training completed") + + # Store preprocessor with model for future use + self.final_model._preprocessor = fitted_preprocessor + self.final_model._feature_names = self.X.columns.tolist() + + def evaluate_holdout(self, hyperparams: Dict[str, Any]) -> Dict[str, float]: + """Evaluate model on holdout test set.""" + print(f"\nHOLDOUT EVALUATION") + print("-" * 25) + + # Split data + X_train, X_test, y_train, y_test = self.data_module.perform_train_test_split(self.X, self.y) + + # Fit preprocessor on training data only + fitted_preprocessor = 
self.data_module.create_and_fit_preprocessor(X_train) + + # Transform data + X_train_processed = fitted_preprocessor.transform(X_train) + X_test_processed = fitted_preprocessor.transform(X_test) + + # Train model + model = self.model_class(**hyperparams) + model.fit(X_train_processed, y_train) + + # Make predictions + y_pred = model.predict(X_test_processed) + + # Calculate metrics + if self.task == "classification": + metrics = { + 'holdout_accuracy': accuracy_score(y_test, y_pred), + 'holdout_f1_weighted': f1_score(y_test, y_pred, average='weighted'), + 'holdout_precision_weighted': precision_score(y_test, y_pred, average='weighted'), + 'holdout_recall_weighted': recall_score(y_test, y_pred, average='weighted') + } + print(f"Holdout Results:") + print(f" Accuracy: {metrics['holdout_accuracy']:.4f}") + print(f" F1 (weighted): {metrics['holdout_f1_weighted']:.4f}") + print(f" Precision (weighted): {metrics['holdout_precision_weighted']:.4f}") + print(f" Recall (weighted): {metrics['holdout_recall_weighted']:.4f}") + else: + mse = mean_squared_error(y_test, y_pred) + metrics = { + 'holdout_rmse': np.sqrt(mse), + 'holdout_mae': mean_absolute_error(y_test, y_pred), + 'holdout_r2': r2_score(y_test, y_pred) + } + print(f"Holdout Results:") + print(f" RMSE: {metrics['holdout_rmse']:.4f}") + print(f" MAE: {metrics['holdout_mae']:.4f}") + print(f" R²: {metrics['holdout_r2']:.4f}") + + return metrics + + def save_model(self) -> Optional[str]: + """Save the trained model with all components.""" + if not self.logging_config.get('save_model', False) or self.final_model is None: + return None + + print(f"\nSAVING MODEL") + print("-" * 20) + + # Create output directory + output_dir = Path(self.logging_config['model_output_path']) + output_dir.mkdir(parents=True, exist_ok=True) + + # Save model with comprehensive package + model_filename = f"{self.model_name}_{self.task}_model.pkl" + model_path = output_dir / model_filename + + # Create comprehensive model package + model_package 
= { + 'model': self.final_model, + 'preprocessor': getattr(self.final_model, '_preprocessor', None), + 'label_encoder': self.label_encoder, + 'feature_names': getattr(self.final_model, '_feature_names', self.X.columns.tolist()), + 'numerical_features': self.data_module.numerical_features, + 'categorical_features': self.data_module.categorical_features, + 'model_name': self.model_name, + 'task': self.task, + 'target_column': self.data_config['target_column'], + 'model_config': { + 'hyperparameters': self.initial_hyperparams, + 'data_config': self.data_config, + 'training_config': self.training_config + }, + 'data_summary': self.data_module.get_data_summary() + } + + joblib.dump(model_package, model_path) + print(f"Model package saved to: {model_path}") + print(f" Package includes: model, preprocessor, encoders, metadata") + + return str(model_path) + + def run(self) -> None: + """Run the complete training pipeline.""" + print("\n" + "="*80) + print("STARTING ML MODEL TRAINING PIPELINE") + print("="*80) + + try: + # Setup MLflow + self.setup_mlflow() + + with mlflow.start_run(run_name=f"{self.model_name}_{self.task}_training"): + # Load data + self.load_data() + + # Load hyperparameters + hyperparams = self.load_hyperparameters() + print(f"\nUsing hyperparameters: {hyperparams}") + + # Log parameters + mlflow.log_params({ + 'model_name': self.model_name, + 'task': self.task, + **hyperparams, + **{k: v for k, v in self.data_config.items() if not isinstance(v, dict)}, + **self.training_config + }) + + # Log additional info + if self.label_encoder is not None: + mlflow.log_param('target_classes', list(self.label_encoder.classes_)) + + mlflow.log_params({ + 'final_features_count': self.X.shape[1], + 'samples_count': self.X.shape[0], + 'numerical_features_count': len(self.data_module.numerical_features), + 'categorical_features_count': len(self.data_module.categorical_features) + }) + + all_metrics = {} + + # Cross-validation or holdout evaluation + if 
self.training_config.get('use_kfold_cv', True): + cv_metrics = self.perform_cross_validation(hyperparams) + all_metrics.update(cv_metrics) + + # Train final model on all data + self.train_final_model(hyperparams) + else: + # Use holdout validation + holdout_metrics = self.evaluate_holdout(hyperparams) + all_metrics.update(holdout_metrics) + + # Train final model on all data + self.train_final_model(hyperparams) + + # Log metrics + for metric_name, metric_value in all_metrics.items(): + if isinstance(metric_value, (int, float)): + mlflow.log_metric(metric_name, metric_value) + + # Save model + model_path = self.save_model() + if model_path: + mlflow.log_artifact(model_path) + + print(f"\n" + "="*80) + print("TRAINING COMPLETED SUCCESSFULLY!") + print("="*80) + print(f"MLflow run ID: {mlflow.active_run().info.run_id}") + if model_path: + print(f"Model saved to: {model_path}") + print("="*80) + + except Exception as e: + print(f"\nTRAINING FAILED: {str(e)}") + import traceback + traceback.print_exc() + raise + finally: + mlflow.end_run() + + +def main(): + """Main training function.""" + import argparse + + parser = argparse.ArgumentParser(description='Train ML model on structured data') + parser.add_argument( + '--config', + default='/Users/ksonar/Documents/Technical/project-template/Structured_data_template/train/config/local_config.cfg', + help='Path to configuration file' + ) + + args = parser.parse_args() + + # Handle relative paths + config_path = Path(args.config) + if not config_path.is_absolute(): + config_path = Path.cwd() / config_path + + if not config_path.exists(): + print(f"Configuration file not found: {config_path}") + sys.exit(1) + + # Create and run trainer + trainer = ModelTrainer(str(config_path)) + trainer.run() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Structured_data_template/train/scripts/tune.py b/Structured_data_template/train/scripts/tune.py new file mode 100644 index 0000000..ad16c4f --- /dev/null 
+++ b/Structured_data_template/train/scripts/tune.py @@ -0,0 +1,414 @@ +""" +Hyperparameter tuning script using Optuna for structured data ML models. +Supports various models and optimization strategies. +""" + +import sys +import os +from pathlib import Path + +# Add parent directory to path for imports +sys.path.append(str(Path(__file__).parent.parent)) + +import json +import numpy as np +import pandas as pd +from typing import Dict, Any, Callable + +import mlflow +import optuna +from sklearn.metrics import accuracy_score, mean_squared_error +from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold +from sklearn.preprocessing import LabelEncoder + +from src.utils.parse_config import ( + load_config, get_model_and_hyperparams, get_data_config, + get_logging_config, get_training_config, validate_config +) +from src.DataModule import DataModule + + +class HyperparameterTuner: + """Handles hyperparameter optimization using Optuna with new architecture.""" + + def __init__(self, config_path: str): + """Initialize tuner with configuration.""" + self.config_path = config_path + self.config = load_config(config_path) + + # Validate configuration + validate_config(self.config) + + # Extract configurations + self.model_class, self.initial_hyperparams = get_model_and_hyperparams(self.config) + self.model_name = self.config["model"]["model_name"] + self.task = self.config["model"]["task"] + self.data_config = get_data_config(self.config) + self.logging_config = get_logging_config(self.config) + self.training_config = get_training_config(self.config) + + # Initialize components + self.data_module = None + self.label_encoder = None + self.X = None + self.y = None + self.fitted_preprocessor = None + + print(f"Initializing hyperparameter tuning for {self.model_name} ({self.task})") + + def setup_mlflow(self) -> None: + try: + import requests + print(f"Setting up MLflow tracking at {self.logging_config['mlflow_tracking_uri']}") + # Test 
connection with timeout + response = requests.get(self.logging_config['mlflow_tracking_uri'], timeout=5) + mlflow.set_tracking_uri(self.logging_config['mlflow_tracking_uri']) + experiment_name = f"{self.model_name}_{self.task}_tuning" + mlflow.set_experiment(experiment_name) + print(f"MLflow experiment: {experiment_name}") + except (requests.exceptions.RequestException, Exception) as e: + print(f"MLflow server not available: {e}") + print("Using local file tracking instead") + mlflow.set_tracking_uri("file:./mlflow_runs") + experiment_name = f"{self.model_name}_{self.task}_tuning" + mlflow.set_experiment(experiment_name) + + def load_and_prepare_data(self) -> None: + """Load and prepare data for tuning using new architecture.""" + print("\n" + "="*60) + print("LOADING AND PREPARING DATA FOR TUNING") + print("="*60) + + # Initialize DataModule with visualization disabled for tuning + self.data_module = DataModule( + data_path=self.data_config['data_path'], + target_column=self.data_config['target_column'], + columns_to_drop=self.data_config.get('columns_to_drop', []), + preprocessor_settings=self.data_config.get('preprocessor_settings', {}), + visualise=False, # Disable visualization during tuning + check_imbalance=self.data_config.get('check_imbalance', False), + test_size=self.data_config.get('test_size', 0.2), + stratify=self.data_config.get('stratify', False), + random_state=self.data_config.get('random_state', 42), + feature_engineering=self.data_config.get('feature_engineering', False) + ) + + # Load and prepare data (DataLoader handles the heavy lifting) + self.X, self.y = self.data_module.load_and_prepare() + + # Encode target for classification + if self.task == "classification": + self.label_encoder = LabelEncoder() + self.y = pd.Series( + self.label_encoder.fit_transform(self.y), + index=self.y.index, + name=self.y.name + ) + print(f"Target encoded. 
Classes: {list(self.label_encoder.classes_)}") + + # Fit preprocessor once for all trials + self.fitted_preprocessor = self.data_module.create_and_fit_preprocessor(self.X) + print("Preprocessor fitted for optimization") + + # Print data summary + print(f"\nDATA SUMMARY FOR TUNING:") + print(f" Features shape: {self.X.shape}") + print(f" Target shape: {self.y.shape}") + print(f" Numerical features: {len(self.data_module.numerical_features)}") + print(f" Categorical features: {len(self.data_module.categorical_features)}") + + def suggest_hyperparameters(self, trial: optuna.Trial) -> Dict[str, Any]: + """Suggest hyperparameters for a trial based on model type.""" + params = {} + + if self.model_name == "random_forest": + params.update({ + 'n_estimators': trial.suggest_int('n_estimators', 50, 500, step=50), + 'max_depth': trial.suggest_int('max_depth', 3, 20), + 'min_samples_split': trial.suggest_int('min_samples_split', 2, 20), + 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10), + 'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]), + 'random_state': self.data_config.get('random_state', 42) + }) + + elif self.model_name == "xgboost": + params.update({ + 'n_estimators': trial.suggest_int('n_estimators', 50, 500, step=50), + 'max_depth': trial.suggest_int('max_depth', 3, 12), + 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True), + 'subsample': trial.suggest_float('subsample', 0.6, 1.0), + 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0), + 'min_child_weight': trial.suggest_int('min_child_weight', 1, 10), + 'gamma': trial.suggest_float('gamma', 0, 0.5), + 'reg_alpha': trial.suggest_float('reg_alpha', 0, 1), + 'reg_lambda': trial.suggest_float('reg_lambda', 0, 1), + 'random_state': self.data_config.get('random_state', 42) + }) + + # Task-specific parameters + if self.task == "classification": + params['objective'] = 'multi:softprob' if len(self.label_encoder.classes_) > 2 else 
'binary:logistic' + params['eval_metric'] = 'mlogloss' if len(self.label_encoder.classes_) > 2 else 'logloss' + else: + params['objective'] = 'reg:squarederror' + params['eval_metric'] = 'rmse' + + elif self.model_name == "linear_regression": + params.update({ + 'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]) + }) + + elif self.model_name == "logistic_regression": + params.update({ + 'C': trial.suggest_float('C', 0.001, 100, log=True), + 'penalty': trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet', None]), + 'solver': trial.suggest_categorical('solver', ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga']), + 'max_iter': trial.suggest_int('max_iter', 100, 1000), + 'random_state': self.data_config.get('random_state', 42) + }) + + # Handle parameter constraints + if params['penalty'] == 'elasticnet': + params['l1_ratio'] = trial.suggest_float('l1_ratio', 0, 1) + if params['solver'] not in ['saga']: + params['solver'] = 'saga' + elif params['penalty'] == 'l1': + if params['solver'] not in ['liblinear', 'saga']: + params['solver'] = 'liblinear' + elif params['penalty'] is None: + if params['solver'] not in ['lbfgs', 'newton-cg', 'sag', 'saga']: + params['solver'] = 'lbfgs' + + return params + + def objective(self, trial: optuna.Trial) -> float: + """Objective function for Optuna optimization.""" + # Get hyperparameters for this trial + hyperparams = self.suggest_hyperparameters(trial) + + try: + # Transform data + X_transformed = self.fitted_preprocessor.transform(self.X) + + # Create model with suggested hyperparameters + model = self.model_class(**hyperparams) + + # Setup cross-validation + cv_folds = self.training_config.get('n_splits', 5) + random_state = self.training_config.get('random_state_cv', 42) + + if self.task == "classification" and self.data_config.get('stratify', False): + cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state) + scoring = 'accuracy' + elif self.task == "classification": + 
cv = KFold(n_splits=cv_folds, shuffle=True, random_state=random_state) + scoring = 'accuracy' + else: + cv = KFold(n_splits=cv_folds, shuffle=True, random_state=random_state) + scoring = 'neg_mean_squared_error' + + # Perform cross-validation + cv_scores = cross_val_score(model, X_transformed, self.y, cv=cv, scoring=scoring, n_jobs=-1) + + # Calculate metric + if self.task == "classification": + metric = cv_scores.mean() + else: + metric = np.sqrt(-cv_scores.mean()) # Convert to RMSE and negate for minimization + + return metric + + except Exception as e: + print(f"Trial failed with error: {e}") + # Return worst possible score for failed trials + return 0.0 if self.task == "classification" else float('inf') + + def run_optimization(self) -> optuna.Study: + """Run Optuna optimization.""" + print(f"\nSTARTING HYPERPARAMETER OPTIMIZATION") + print(f" Trials: {self.training_config['n_trials']}") + print(f" Timeout: {self.training_config['timeout']} seconds") + print(f" Direction: {self.training_config['optuna_direction']}") + print("-" * 50) + + # Create study + direction = self.training_config['optuna_direction'] + if self.task == "regression" and direction == "maximize": + direction = "minimize" # RMSE should be minimized + + study = optuna.create_study( + direction=direction, + sampler=optuna.samplers.TPESampler(seed=self.training_config['random_state_cv']) + ) + + # Run optimization + study.optimize( + self.objective, + n_trials=self.training_config['n_trials'], + timeout=self.training_config['timeout'], + show_progress_bar=True + ) + + return study + + def save_best_parameters(self, study: optuna.Study) -> str: + """Save best parameters to JSON file.""" + print(f"\nSAVING OPTIMIZATION RESULTS") + print("-" * 35) + + # Create models directory + models_dir = Path("models") + models_dir.mkdir(exist_ok=True) + + # Save best parameters + best_params_path = models_dir / f"best_params_{self.model_name}.json" + with open(best_params_path, 'w') as f: + 
json.dump(study.best_params, f, indent=2) + + print(f"Best parameters saved to: {best_params_path}") + + # Print results summary + print(f"\nOPTIMIZATION COMPLETED") + print(f" Best value: {study.best_value:.4f}") + print(f" Best parameters:") + for param, value in study.best_params.items(): + print(f" {param}: {value}") + + return str(best_params_path) + + def save_study_visualization(self, study: optuna.Study) -> None: + """Save optimization visualizations if optuna visualization is available.""" + try: + import optuna.visualization as vis + + # Create plots directory + plots_dir = Path("plots/optimization") + plots_dir.mkdir(parents=True, exist_ok=True) + + # Optimization history + fig = vis.plot_optimization_history(study) + fig.write_html(plots_dir / "optimization_history.html") + + # Parameter importance + if len(study.trials) > 1: + fig = vis.plot_param_importances(study) + fig.write_html(plots_dir / "parameter_importance.html") + + # Parallel coordinate plot + if len(study.trials) > 1: + fig = vis.plot_parallel_coordinate(study) + fig.write_html(plots_dir / "parallel_coordinate.html") + + print(f"Optimization visualizations saved to: {plots_dir}") + + except ImportError: + print("Optuna visualization not available. Install with: pip install optuna[visualization]") + except Exception as e: + print(f"Could not create visualizations: {e}") + + def run(self) -> None: + """Run the complete tuning pipeline.""" + if not self.training_config.get('use_optuna', False): + print("Optuna tuning is disabled in configuration. 
Exiting.") + return + + print("\n" + "="*80) + print("STARTING HYPERPARAMETER OPTIMIZATION PIPELINE") + print("="*80) + + try: + # Setup MLflow + self.setup_mlflow() + + with mlflow.start_run(run_name=f"{self.model_name}_{self.task}_tuning"): + # Load and prepare data + self.load_and_prepare_data() + + # Log configuration parameters + mlflow.log_params({ + 'model_name': self.model_name, + 'task': self.task, + 'n_trials': self.training_config['n_trials'], + 'timeout': self.training_config['timeout'], + 'optuna_direction': self.training_config['optuna_direction'], + **{k: v for k, v in self.data_config.items() if not isinstance(v, dict)} + }) + + # Log additional info + if self.label_encoder is not None: + mlflow.log_param('target_classes', list(self.label_encoder.classes_)) + + mlflow.log_params({ + 'features_count': self.X.shape[1], + 'samples_count': self.X.shape[0], + 'numerical_features_count': len(self.data_module.numerical_features), + 'categorical_features_count': len(self.data_module.categorical_features) + }) + + # Run optimization + study = self.run_optimization() + + # Log best results + mlflow.log_metric('best_value', study.best_value) + mlflow.log_params({f"best_{k}": v for k, v in study.best_params.items()}) + + # Save results + best_params_path = self.save_best_parameters(study) + mlflow.log_artifact(best_params_path) + + # Save visualizations + self.save_study_visualization(study) + + # Log study statistics + mlflow.log_metrics({ + 'n_trials_completed': len(study.trials), + 'n_trials_pruned': len([t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]), + 'n_trials_failed': len([t for t in study.trials if t.state == optuna.trial.TrialState.FAIL]) + }) + + print(f"\n" + "="*80) + print("HYPERPARAMETER TUNING COMPLETED SUCCESSFULLY!") + print("="*80) + print(f"MLflow run ID: {mlflow.active_run().info.run_id}") + print(f"Best parameters saved to: {best_params_path}") + print("Run train.py next to train the model with optimized 
parameters") + print("="*80) + + except Exception as e: + print(f"\nTUNING FAILED: {str(e)}") + import traceback + traceback.print_exc() + raise + finally: + mlflow.end_run() + + +def main(): + """Main tuning function.""" + import argparse + + parser = argparse.ArgumentParser(description='Tune hyperparameters for ML model') + parser.add_argument( + '--config', + default='/Users/ksonar/Documents/Technical/project-template/Structured_data_template/train/config/local_config.cfg', + help='Path to configuration file' + ) + + args = parser.parse_args() + + # Handle relative paths + config_path = Path(args.config) + if not config_path.is_absolute(): + config_path = Path.cwd() / config_path + + if not config_path.exists(): + print(f"Configuration file not found: {config_path}") + sys.exit(1) + + # Create and run tuner + tuner = HyperparameterTuner(str(config_path)) + tuner.run() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Structured_data_template/train/src/DataLoader.py b/Structured_data_template/train/src/DataLoader.py new file mode 100644 index 0000000..43cd6e4 --- /dev/null +++ b/Structured_data_template/train/src/DataLoader.py @@ -0,0 +1,427 @@ +import pandas as pd +import numpy as np +from typing import Tuple, List, Dict, Any, Optional, Union +from pathlib import Path +import matplotlib.pyplot as plt +import seaborn as sns + +# Import utility modules +from .utils.data_utils import FeatureEngineer, FeatureSelector, DataTransformer, DataProfiler +from .utils.visualise import DataVisualizer + + +class DataLoader: + """ + Enhanced class to load, clean, visualize and prepare data for machine learning. + Handles initial data preparation before preprocessing pipeline creation. + """ + + def __init__(self, data_path: str, target_column: str, columns_to_drop: Optional[List[str]] = None, + visualise: bool = False, check_imbalance: bool = False, + random_state: int = 42, feature_engineering: bool = False): + """ + Initialize DataLoader. 
+ + Args: + data_path: Path to the CSV data file + target_column: Name of the target column + columns_to_drop: List of columns to drop + visualise: Whether to generate visualizations + check_imbalance: Whether to check class imbalance + random_state: Random state for reproducibility + feature_engineering: Whether to perform automated feature engineering + """ + self.data_path = data_path + self.target_column = target_column + self.columns_to_drop = columns_to_drop or [] + self.visualise = visualise + self.check_imbalance = check_imbalance + self.random_state = random_state + self.feature_engineering = feature_engineering + + # Internal attributes + self.data: Optional[pd.DataFrame] = None + self.numerical_features: List[str] = [] + self.categorical_features: List[str] = [] + + # Utility classes + self.feature_engineer = FeatureEngineer(random_state=random_state) if feature_engineering else None + self.feature_selector = FeatureSelector(random_state=random_state) + self.data_transformer = DataTransformer(random_state=random_state) + self.data_profiler = DataProfiler() + self.visualizer = DataVisualizer() if visualise else None + + print(f"DataLoader initialized for target: {target_column}") + + def load_and_prepare_data(self) -> Tuple[pd.DataFrame, pd.Series]: + """ + Main method to load and prepare data for ML pipeline. 
+ + Returns: + tuple: Prepared features (X) and target (y) + """ + print("\n=== DATA LOADING AND PREPARATION ===") + + # Load raw data + self._load_data() + + # Generate data profile if requested + if self.visualise: + self._generate_data_profile() + + # Data cleaning and preparation + self._clean_data() + + # Drop specified columns + self._drop_columns() + + # Validate target column + self._validate_target_column() + + # Split features and target + X, y = self._split_features_target() + + # Infer column types + self._infer_column_types(X) + + # Data quality improvements + X = self._improve_data_quality(X) + + # Feature engineering (if enabled) + if self.feature_engineering and self.feature_engineer: + X = self._perform_feature_engineering(X) + + # Visualization + if self.visualise and self.visualizer: + self._create_visualizations(X, y) + + # Check target imbalance + if self.check_imbalance: + self._check_target_imbalance(y) + + print(f"Data preparation completed. Final shape: {X.shape}") + return X, y + + def _load_data(self) -> None: + """Load data from CSV file.""" + try: + print(f"Loading data from: {self.data_path}") + self.data = pd.read_csv(self.data_path) + print(f"Data loaded successfully. 
Shape: {self.data.shape}") + print(f"Columns: {list(self.data.columns)}") + except FileNotFoundError: + raise FileNotFoundError(f"Data file not found: {self.data_path}") + except Exception as e: + raise IOError(f"Error loading data: {e}") + + def _generate_data_profile(self) -> None: + """Generate comprehensive data profile.""" + print("\nGenerating comprehensive data profile") + profile = self.data_profiler.generate_data_profile(self.data, self.target_column) + + # Save profile report + profile_path = self.data_profiler.save_profile_report(profile, "data_profile_report.json") + + # Print key insights + overview = profile.get('overview', {}) + print(f" Dataset shape: {overview.get('shape', 'Unknown')}") + print(f" Memory usage: {overview.get('memory_usage_mb', 0):.2f} MB") + print(f" Data quality score: {profile.get('data_quality', {}).get('overall_quality', 0):.2f}") + + def _clean_data(self) -> None: + """Perform basic data cleaning operations.""" + print("\nPerforming data cleaning") + + initial_shape = self.data.shape + + # Remove completely empty rows and columns. 
def _optimize_data_types(self) -> None:
    """Reduce memory usage by downcasting column dtypes in ``self.data``.

    Object columns with a low unique-value ratio become ``category``;
    int64 columns are downcast to the smallest integer type that fits their
    observed range. float64 columns are deliberately left alone: downcasting
    them would silently lose precision (the old dtype check matched float64
    but never handled it — that dead branch is removed here).
    """
    n_rows = len(self.data)
    if n_rows == 0:
        return  # guard: the uniqueness ratio below would divide by zero

    for col in self.data.columns:
        col_type = self.data[col].dtype

        if col_type == 'object':
            # Try to convert to category if few unique values
            if self.data[col].nunique() / n_rows < 0.5:
                self.data[col] = self.data[col].astype('category')

        elif col_type == 'int64':
            # Hoist min/max so each is computed once per column, not per test.
            col_min = self.data[col].min()
            col_max = self.data[col].max()
            if col_min >= 0:
                # Unsigned downcast.
                if col_max <= 255:
                    self.data[col] = self.data[col].astype('uint8')
                elif col_max <= 65535:
                    self.data[col] = self.data[col].astype('uint16')
                elif col_max <= 4294967295:
                    self.data[col] = self.data[col].astype('uint32')
            else:
                # Signed downcast.
                if col_min >= -128 and col_max <= 127:
                    self.data[col] = self.data[col].astype('int8')
                elif col_min >= -32768 and col_max <= 32767:
                    self.data[col] = self.data[col].astype('int16')
                elif col_min >= -2147483648 and col_max <= 2147483647:
                    self.data[col] = self.data[col].astype('int32')


def _drop_columns(self) -> None:
    """Drop configured columns from ``self.data`` (unknown names ignored)."""
    if not self.columns_to_drop:
        return

    initial_columns = set(self.data.columns)

    # Accept a comma-separated string as well as a list of column names.
    if isinstance(self.columns_to_drop, str):
        self.columns_to_drop = [col.strip() for col in self.columns_to_drop.split(',')]

    self.data = self.data.drop(columns=self.columns_to_drop, errors='ignore')
    dropped_actual = initial_columns - set(self.data.columns)

    if dropped_actual:
        print(f"Dropped columns: {', '.join(sorted(dropped_actual))}")


def _validate_target_column(self) -> None:
    """Raise ``ValueError`` if the configured target column is missing."""
    if self.target_column not in self.data.columns:
        raise ValueError(f"Target column '{self.target_column}' not found in data")


def _split_features_target(self) -> Tuple[pd.DataFrame, pd.Series]:
    """Return ``(X, y)``: all feature columns, and the target series."""
    X = self.data.drop(columns=[self.target_column])
    y = self.data[self.target_column]
    return X, y


def _infer_column_types(self, X: pd.DataFrame) -> None:
    """Record numerical/categorical feature names inferred from X's dtypes."""
    self.numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
    self.categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

    print("Inferred column types:")
    print(f" Numerical features ({len(self.numerical_features)}): {self.numerical_features}")
    print(f" Categorical features ({len(self.categorical_features)}): {self.categorical_features}")


def _improve_data_quality(self, X: pd.DataFrame) -> pd.DataFrame:
    """Clip IQR outliers and log-transform heavily skewed numeric columns."""
    print("\nImproving data quality")

    if self.numerical_features:
        # NOTE(review): assumes DataTransformer.handle_outliers returns a frame
        # with the same columns — confirm against its implementation.
        X = self.data_transformer.handle_outliers(
            X, self.numerical_features, method='iqr', factor=1.5
        )

    # |skew| > 2 is treated as "highly skewed".
    skewed_features = [
        col for col in self.numerical_features
        if col in X.columns
        and pd.api.types.is_numeric_dtype(X[col])
        and abs(X[col].skew()) > 2
    ]

    if skewed_features:
        print(f"Applying log transformation to skewed features: {skewed_features}")
        X = self.data_transformer.apply_log_transform(X, skewed_features)

    return X
_perform_feature_engineering(self, X: pd.DataFrame) -> pd.DataFrame: + """Perform automated feature engineering.""" + print("\nPerforming feature engineering") + + # Create interaction features for top numerical features + if len(self.numerical_features) >= 2: + top_numerical = self.numerical_features[:min(5, len(self.numerical_features))] + X = self.feature_engineer.create_interaction_features(X, top_numerical, max_combinations=5) + + # Create polynomial features for selected numerical columns + if self.numerical_features: + # Select columns with reasonable ranges for polynomial features + suitable_cols = [] + for col in self.numerical_features[:3]: # Limit to first 3 to avoid explosion + if col in X.columns: + col_range = X[col].max() - X[col].min() + if col_range > 0 and col_range < 1000: # Reasonable range + suitable_cols.append(col) + + if suitable_cols: + X = self.feature_engineer.create_polynomial_features( + X, suitable_cols, degree=2, include_bias=False + ) + + # Create binning features for numerical columns with high cardinality + for col in self.numerical_features: + if col in X.columns and X[col].nunique() > 20: + X = self.feature_engineer.create_binning_features(X, col, n_bins=5, strategy='quantile') + + # Print summary + if self.feature_engineer: + summary = self.feature_engineer.get_feature_creation_summary() + print(f"Feature engineering summary: {summary['total_features_created']} new features created") + + return X + + def _create_visualizations(self, X: pd.DataFrame, y: pd.Series) -> None: + """Create comprehensive visualizations.""" + print("\nCreating visualizations") + + # Combine features and target for visualization + viz_data = X.copy() + viz_data[self.target_column] = y + + # Basic distributions + self.visualizer.plot_distributions( + viz_data, + target_column=self.target_column, + save_name="feature_distributions" + ) + + # Correlation matrix for numerical features + if self.numerical_features: + self.visualizer.plot_correlation_matrix( + 
viz_data, + save_name="correlation_matrix" + ) + + # Missing data patterns + self.visualizer.plot_missing_data_pattern( + viz_data, + save_name="missing_data_patterns" + ) + + # Outlier analysis + self.visualizer.plot_outliers_analysis( + viz_data, + save_name="outlier_analysis" + ) + + def _check_target_imbalance(self, y: pd.Series) -> None: + """Check and report target class imbalance.""" + class_counts = y.value_counts().sort_index() + total_samples = len(y) + + print(f"\nTarget variable '{self.target_column}' distribution:") + for class_name, count in class_counts.items(): + percentage = (count / total_samples) * 100 + print(f" {class_name}: {count:,} samples ({percentage:.1f}%)") + + # Check for significant imbalance + min_percentage = (class_counts.min() / total_samples) * 100 + if min_percentage < 10: + print(f"WARNING: Class imbalance detected! Smallest class: {min_percentage:.1f}%") + print(" Consider using stratified sampling or class balancing techniques.") + + @staticmethod + def impute_data(df: pd.DataFrame, categorical_columns: List[str], numerical_columns: List[str]) -> pd.DataFrame: + """ + Basic imputation method (kept for backward compatibility). + + Args: + df: DataFrame to impute + categorical_columns: List of categorical column names + numerical_columns: List of numerical column names + + Returns: + DataFrame with imputed values + """ + df_imputed = df.copy() + + for col in categorical_columns: + if col in df_imputed.columns: + df_imputed[col] = df_imputed[col].fillna(df_imputed[col].mode()[0] if not df_imputed[col].mode().empty else 'Unknown') + + for col in numerical_columns: + if col in df_imputed.columns: + df_imputed[col] = df_imputed[col].fillna(df_imputed[col].mean()) + + return df_imputed + + @staticmethod + def remove_outliers(df: pd.DataFrame, numerical_columns: List[str], threshold: float = 1.5) -> pd.DataFrame: + """ + Remove outliers using IQR method (kept for backward compatibility). 
+ + Args: + df: DataFrame to process + numerical_columns: List of numerical column names + threshold: IQR threshold for outlier detection + + Returns: + DataFrame with outliers removed + """ + df_clean = df.copy() + + for col in numerical_columns: + if col in df_clean.columns: + Q1 = df_clean[col].quantile(0.25) + Q3 = df_clean[col].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - threshold * IQR + upper_bound = Q3 + threshold * IQR + df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)] + + return df_clean + + @staticmethod + def normalize_data(df: pd.DataFrame, numerical_columns: List[str]) -> pd.DataFrame: + """ + Normalize numerical data (kept for backward compatibility). + + Args: + df: DataFrame to normalize + numerical_columns: List of numerical column names + + Returns: + DataFrame with normalized values + """ + df_normalized = df.copy() + + for col in numerical_columns: + if col in df_normalized.columns: + df_normalized[col] = (df_normalized[col] - df_normalized[col].mean()) / df_normalized[col].std() + + return df_normalized + + def get_data_summary(self) -> Dict[str, Any]: + """ + Get comprehensive data summary. 
def get_data_summary(self) -> Dict[str, Any]:
    """
    Get comprehensive data summary.

    Returns:
        Dictionary containing data summary statistics
    """
    if self.data is None:
        return {}

    summary: Dict[str, Any] = {
        'original_shape': self.data.shape,
        'columns': self.data.columns.tolist(),
        'numerical_features': self.numerical_features,
        'categorical_features': self.categorical_features,
        'missing_values': self.data.isnull().sum().to_dict(),
        'dtypes': self.data.dtypes.astype(str).to_dict(),
    }

    if self.target_column in self.data.columns:
        summary['target_distribution'] = self.data[self.target_column].value_counts().to_dict()

    # Attach the feature-engineering report when an engineer is attached.
    if self.feature_engineer:
        summary['feature_engineering'] = self.feature_engineer.get_feature_creation_summary()

    return summary


# ---------------------------------------------------------------------------
# DataModule.py
# ---------------------------------------------------------------------------
import pandas as pd
import numpy as np
from typing import Tuple, List, Dict, Any, Optional, Union
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from pathlib import Path

# Import DataLoader
from .DataLoader import DataLoader


class DataModule:
    """
    Streamlined class focused on preprocessing pipeline creation and data splitting.
    Works in conjunction with DataLoader for complete ML data preparation.
    """

    def __init__(self, data_path: str, target_column: str, columns_to_drop: Optional[List[str]] = None,
                 preprocessor_settings: Optional[Dict[str, Any]] = None, visualise: bool = False,
                 check_imbalance: bool = False, test_size: float = 0.2, stratify: bool = False,
                 random_state: int = 42, feature_engineering: bool = False):
        """
        Initialize DataModule.

        Args:
            data_path: Path to the CSV data file
            target_column: Name of the target column
            columns_to_drop: List of columns to drop
            preprocessor_settings: Settings for preprocessing pipelines
            visualise: Whether to generate visualizations
            check_imbalance: Whether to check class imbalance
            test_size: Proportion of data for testing
            stratify: Whether to use stratified splitting
            random_state: Random state for reproducibility
            feature_engineering: Whether to perform automated feature engineering
        """
        # Split and reproducibility configuration.
        self.data_path = data_path
        self.target_column = target_column
        self.columns_to_drop = columns_to_drop or []
        self.preprocessor_settings = preprocessor_settings or {}
        self.test_size = test_size
        self.stratify = stratify
        self.random_state = random_state

        # Loading, cleaning and feature engineering are delegated to DataLoader.
        self.data_loader = DataLoader(
            data_path=data_path,
            target_column=target_column,
            columns_to_drop=columns_to_drop,
            visualise=visualise,
            check_imbalance=check_imbalance,
            random_state=random_state,
            feature_engineering=feature_engineering,
        )

        # Populated by load_and_prepare().
        self.X: Optional[pd.DataFrame] = None
        self.y: Optional[pd.Series] = None
        self.preprocessor: Optional[ColumnTransformer] = None
        self.numerical_features: List[str] = []
        self.categorical_features: List[str] = []
def load_and_prepare(self) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Load data using DataLoader and prepare preprocessing pipeline.

    Returns:
        tuple: Features (X) and target (y) DataFrames
    """
    print("\n=== DATAMODULE: PREPROCESSING PIPELINE SETUP ===")

    # Delegate loading/cleaning/feature engineering to the DataLoader.
    self.X, self.y = self.data_loader.load_and_prepare_data()

    self.numerical_features = self.data_loader.numerical_features.copy()
    self.categorical_features = self.data_loader.categorical_features.copy()

    # Engineered columns may have changed the numerical/categorical split.
    self._update_feature_types()
    self._setup_preprocessor()

    return self.X, self.y


def _update_feature_types(self) -> None:
    """Re-infer feature types after loading (new columns may have appeared)."""
    current_numerical = self.X.select_dtypes(include=[np.number]).columns.tolist()
    current_categorical = self.X.select_dtypes(include=['object', 'category']).columns.tolist()

    if set(current_numerical) != set(self.numerical_features) or set(current_categorical) != set(self.categorical_features):
        print("Updating feature types after data loading:")
        print(f" Numerical: {len(self.numerical_features)} -> {len(current_numerical)}")
        print(f" Categorical: {len(self.categorical_features)} -> {len(current_categorical)}")

        self.numerical_features = current_numerical
        self.categorical_features = current_categorical


def _setup_preprocessor(self) -> None:
    """Assemble the ColumnTransformer from the per-dtype pipelines."""
    print("\nSetting up preprocessing pipeline...")

    transformers = []

    if self.numerical_features:
        num_pipeline = self._create_numerical_pipeline()
        if num_pipeline.steps:
            transformers.append(('num', num_pipeline, self.numerical_features))

    if self.categorical_features:
        cat_pipeline = self._create_categorical_pipeline()
        if cat_pipeline.steps:
            transformers.append(('cat', cat_pipeline, self.categorical_features))

    if transformers:
        self.preprocessor = ColumnTransformer(
            transformers=transformers,
            remainder='passthrough',
            sparse_threshold=0
        )
        print(f"Preprocessor created with {len(transformers)} transformer(s)")
    else:
        # Fallback: passthrough-only preprocessor keeps the downstream API uniform.
        self.preprocessor = ColumnTransformer(
            transformers=[],
            remainder='passthrough',
            sparse_threshold=0
        )
        print("No active transformers configured, using passthrough")


def _create_numerical_pipeline(self) -> Pipeline:
    """Create the numerical preprocessing pipeline (imputer + scaler).

    Fix: when different columns request different strategies, the winning
    strategy is now chosen deterministically (sorted order) instead of by
    arbitrary ``set`` iteration order. Unrecognized imputer names still
    fall back to 'mean', as before.
    """
    steps = []

    num_settings = self.preprocessor_settings.get('numerical', {})

    # Collect the strategies requested across all configured columns.
    imputation_methods = set()
    scaling_methods = set()

    for col_name, settings in num_settings.items():
        if col_name in self.numerical_features:
            if 'imputer' in settings:
                strategy = 'median' if settings['imputer'] == 'median' else 'mean'
                imputation_methods.add(strategy)

            if 'scaler' in settings:
                scaling_methods.add(settings['scaler'])

    # Impute when requested, or whenever missing values are actually present.
    missing_values_exist = self.X[self.numerical_features].isnull().any().any()
    if imputation_methods or missing_values_exist:
        strategy = sorted(imputation_methods)[0] if imputation_methods else 'median'
        steps.append(('imputer', SimpleImputer(strategy=strategy)))
        print(f" Added numerical imputer: {strategy}")

    # Scale when requested, or default to standard scaling if any numerical
    # settings exist at all.
    if scaling_methods or num_settings:
        scaler_type = sorted(scaling_methods)[0] if scaling_methods else 'standard'
        if scaler_type == 'standard':
            steps.append(('scaler', StandardScaler()))
        elif scaler_type == 'minmax':
            steps.append(('scaler', MinMaxScaler()))
        print(f" Added numerical scaler: {scaler_type}")

    return Pipeline(steps)
def _create_categorical_pipeline(self) -> Pipeline:
    """Create the categorical preprocessing pipeline (imputer + encoder).

    Fix: when several columns request different encoders, the winner is now
    chosen deterministically (sorted order) instead of by arbitrary ``set``
    iteration order.
    """
    steps = []

    cat_settings = self.preprocessor_settings.get('categorical', {})

    # Impute when requested, or whenever missing values are actually present.
    missing_values_exist = self.X[self.categorical_features].isnull().any().any()
    imputation_needed = any('imputer' in settings for settings in cat_settings.values())

    if imputation_needed or missing_values_exist:
        steps.append(('imputer', SimpleImputer(strategy='most_frequent')))
        print(" Added categorical imputer: most_frequent")

    # Collect encoder choices and their per-encoder options.
    encoding_methods = set()
    encoder_options = {}

    for col_name, settings in cat_settings.items():
        if col_name in self.categorical_features and 'encoder' in settings:
            encoder_type = settings['encoder']
            encoding_methods.add(encoder_type)
            if 'encoder_options' in settings:
                encoder_options[encoder_type] = settings['encoder_options']

    # Use the configured encoder, or default to one-hot when any categorical
    # features exist.
    if encoding_methods or self.categorical_features:
        encoder_type = sorted(encoding_methods)[0] if encoding_methods else 'onehot'
        options = encoder_options.get(encoder_type, {})

        if encoder_type == 'onehot':
            # NOTE: 'sparse_output' requires scikit-learn >= 1.2.
            default_options = {'handle_unknown': 'ignore', 'sparse_output': False}
            default_options.update(options)
            steps.append(('encoder', OneHotEncoder(**default_options)))
            print(f" Added categorical encoder: OneHot with options {default_options}")
        elif encoder_type == 'ordinal':
            default_options = {'handle_unknown': 'use_encoded_value', 'unknown_value': -1}
            default_options.update(options)
            steps.append(('encoder', OrdinalEncoder(**default_options)))
            print(f" Added categorical encoder: Ordinal with options {default_options}")

    return Pipeline(steps)


def create_and_fit_preprocessor(self, X: pd.DataFrame) -> ColumnTransformer:
    """
    Fit the preprocessor on the provided data.

    Args:
        X: Feature DataFrame to fit on

    Returns:
        Fitted preprocessor

    Raises:
        RuntimeError: if load_and_prepare() has not been called yet.
    """
    if self.preprocessor is None:
        raise RuntimeError("Preprocessor not set up. Call load_and_prepare() first.")

    print("Fitting preprocessor on training data...")
    self.preprocessor.fit(X)
    print("Preprocessor fitting completed")
    return self.preprocessor


def transform_data(self, X: pd.DataFrame) -> np.ndarray:
    """
    Transform data using the fitted preprocessor.

    Args:
        X: Data to transform

    Returns:
        Transformed data array

    Raises:
        RuntimeError: if load_and_prepare() has not been called yet.
    """
    if self.preprocessor is None:
        raise RuntimeError("Preprocessor not set up. Call load_and_prepare() first.")

    return self.preprocessor.transform(X)


def perform_train_test_split(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Perform train-test split.

    Args:
        X: Features
        y: Target

    Returns:
        X_train, X_test, y_train, y_test
    """
    print(f"\nPerforming train-test split (test_size={self.test_size})...")

    stratify_param = y if self.stratify else None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=self.test_size,
        random_state=self.random_state,
        stratify=stratify_param
    )

    split_type = "stratified" if self.stratify else "random"
    print(f" {split_type.title()} split completed: Train={X_train.shape[0]}, Test={X_test.shape[0]}")

    return X_train, X_test, y_train, y_test
+ + Args: + fitted_preprocessor: Fitted ColumnTransformer + initial_feature_names: Original feature names + + Returns: + List of feature names after preprocessing + """ + try: + if hasattr(fitted_preprocessor, 'get_feature_names_out'): + return fitted_preprocessor.get_feature_names_out(initial_feature_names).tolist() + else: + print("WARNING: Cannot determine feature names after preprocessing") + # Estimate number of features + dummy_df = pd.DataFrame(columns=initial_feature_names) + n_features = fitted_preprocessor.transform(dummy_df).shape[1] if len(dummy_df.columns) > 0 else 0 + return [f'feature_{i}' for i in range(n_features)] + except Exception as e: + print(f"WARNING: Error getting feature names: {e}") + return [] + + def get_data_summary(self) -> Dict[str, Any]: + """ + Get comprehensive data summary including DataLoader insights. + + Returns: + Dictionary containing data summary statistics + """ + summary = { + 'datamodule_info': { + 'numerical_features': self.numerical_features, + 'categorical_features': self.categorical_features, + 'preprocessor_configured': self.preprocessor is not None, + 'test_size': self.test_size, + 'stratify': self.stratify + } + } + + # Add DataLoader summary if available + if self.data_loader: + loader_summary = self.data_loader.get_data_summary() + summary.update(loader_summary) + + return summary + + def get_preprocessing_info(self) -> Dict[str, Any]: + """ + Get information about the preprocessing pipeline. 
def get_preprocessing_info(self) -> Dict[str, Any]:
    """
    Get information about the preprocessing pipeline.

    Returns:
        Dictionary with preprocessing pipeline details
    """
    if self.preprocessor is None:
        return {'status': 'not_configured'}

    info: Dict[str, Any] = {
        'status': 'configured',
        'transformers': [],
        'numerical_features_count': len(self.numerical_features),
        'categorical_features_count': len(self.categorical_features),
    }

    # Describe each fitted transformer and its pipeline steps.
    for name, transformer, columns in self.preprocessor.transformers_:
        entry = {
            'name': name,
            'type': type(transformer).__name__,
            'columns': columns if isinstance(columns, list) else list(columns),
            'steps': [
                {'name': step_name, 'type': type(step_transformer).__name__}
                for step_name, step_transformer in transformer.steps
            ] if hasattr(transformer, 'steps') else [],
        }
        info['transformers'].append(entry)

    return info


# ---------------------------------------------------------------------------
# Data_explore.py
# ---------------------------------------------------------------------------
# Interactive Data Exploration Notebook
# This notebook provides easy data visualization and insights using existing utilities

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import json

# Import your existing utilities
from utils.data_utils import DataProfiler, FeatureEngineer, DataTransformer
from utils.visualise import DataVisualizer

# For interactive widgets
try:
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
    WIDGETS_AVAILABLE = True
except ImportError:
    print("Install ipywidgets for interactive features: pip install ipywidgets")
    WIDGETS_AVAILABLE = False

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
# NOTE(review): blanket warning suppression hides real problems; consider
# narrowing this to specific categories.
warnings.filterwarnings('ignore')
class InteractiveDataExplorer:
    """Interactive data exploration tool using existing utilities."""

    def __init__(self):
        # Loaded dataset and profiling state.
        self.data = None
        self.target_column = None
        self.profile = None
        # Helper objects from the project's utility modules.
        self.data_profiler = DataProfiler()
        self.visualizer = DataVisualizer(output_dir="exploration_plots")
        self.feature_engineer = FeatureEngineer()
        self.data_transformer = DataTransformer()

    def load_data(self, file_path):
        """Load a CSV into memory; return True on success, False on failure."""
        try:
            self.data = pd.read_csv(file_path)
        except Exception as e:
            print(f"✗ Error loading data: {e}")
            return False
        print("✓ Data loaded successfully!")
        print(f"Shape: {self.data.shape}")
        print(f"Columns: {list(self.data.columns)}")
        return True

    def quick_overview(self):
        """Print a quick overview: shape, memory, dtypes and missing data."""
        if self.data is None:
            print("Please load data first!")
            return

        print("="*60)
        print("QUICK DATA OVERVIEW")
        print("="*60)

        print(f"Dataset Shape: {self.data.shape}")
        print(f"Memory Usage: {self.data.memory_usage(deep=True).sum() / (1024*1024):.2f} MB")
        print(f"Duplicate Rows: {self.data.duplicated().sum()}")

        n_num = len(self.data.select_dtypes(include=[np.number]).columns)
        n_cat = len(self.data.select_dtypes(include=['object', 'category']).columns)
        n_dt = len(self.data.select_dtypes(include=['datetime64']).columns)
        print(f"\nColumn Types:")
        print(f" Numerical: {n_num}")
        print(f" Categorical: {n_cat}")
        print(f" Datetime: {n_dt}")

        missing_data = self.data.isnull().sum()
        columns_with_missing = missing_data[missing_data > 0]
        if len(columns_with_missing) > 0:
            print(f"\nColumns with Missing Data:")
            for col, count in columns_with_missing.items():
                pct = (count / len(self.data)) * 100
                print(f" {col}: {count} ({pct:.1f}%)")
        else:
            print(f"\nNo missing data found!")

    def generate_comprehensive_profile(self, target_col=None):
        """Profile the dataset with DataProfiler and print the key insights."""
        if self.data is None:
            print("Please load data first!")
            return

        self.target_column = target_col
        print("Generating comprehensive data profile...")

        self.profile = self.data_profiler.generate_data_profile(
            self.data,
            target_col=target_col
        )

        # Persist the profile, then surface the highlights.
        self.data_profiler.save_profile_report(self.profile, "exploration_data_profile.json")
        self._display_profile_insights()

    def _display_profile_insights(self):
        """Print quality score, missingness, correlations and target insights."""
        if not self.profile:
            return

        print("\n" + "="*60)
        print("DATA PROFILE INSIGHTS")
        print("="*60)

        quality = self.profile.get('data_quality', {})
        if quality:
            print(f"Overall Data Quality Score: {quality.get('overall_quality', 0):.2f}/1.00")
            print(f" Completeness: {quality.get('completeness', 0):.2f}")
            print(f" Uniqueness: {quality.get('uniqueness', 0):.2f}")
            print(f" Consistency: {quality.get('consistency', 0):.2f}")

        missing = self.profile.get('missing_data', {})
        if missing:
            print(f"\nMissing Data: {missing.get('missing_percentage_overall', 0):.1f}% overall")
            print(f"Complete Rows: {missing.get('complete_rows_percentage', 0):.1f}%")

        high_corr = self.profile.get('correlations', {}).get('high_correlations', [])
        if high_corr:
            print(f"\nHigh Correlations Found:")
            for pair in high_corr[:5]:
                print(f" {pair['feature1']} ↔ {pair['feature2']}: {pair['correlation']:.3f}")

        if self.target_column and 'target_analysis' in self.profile:
            target_info = self.profile['target_analysis']
            print(f"\nTarget Variable '{self.target_column}':")
            print(f" Type: {target_info.get('dtype', 'unknown')}")
            print(f" Missing Values: {target_info.get('missing_count', 0)}")
            print(f" Unique Values: {target_info.get('unique_count', 0)}")
            print(f" Recommended Task: {target_info.get('recommended_task', 'unknown')}")

    def create_visualizations(self, target_col=None):
        """Create distribution/correlation/missing/outlier plots on disk."""
        if self.data is None:
            print("Please load data first!")
            return

        print("Creating visualizations...")

        self.visualizer.plot_distributions(
            self.data,
            target_column=target_col,
            save_name="exploration_distributions"
        )

        numerical_data = self.data.select_dtypes(include=[np.number])
        if len(numerical_data.columns) > 1:
            self.visualizer.plot_correlation_matrix(
                self.data,
                save_name="exploration_correlations"
            )

        if self.data.isnull().any().any():
            self.visualizer.plot_missing_data_pattern(
                self.data,
                save_name="exploration_missing_patterns"
            )

        if len(numerical_data.columns) > 0:
            self.visualizer.plot_outliers_analysis(
                self.data,
                save_name="exploration_outliers"
            )

        print("Visualizations created in 'exploration_plots' directory!")
def analyze_target_relationship(self, target_col):
    """Print target distribution, an imbalance warning and top correlations."""
    if self.data is None or target_col not in self.data.columns:
        print("Please load data and specify a valid target column!")
        return

    print(f"\nAnalyzing relationships with target: '{target_col}'")
    print("="*50)

    target_series = self.data[target_col]
    n_rows = len(target_series)

    # Class distribution.
    print(f"Target Distribution:")
    counts = target_series.value_counts()
    for value, count in counts.items():
        share = (count / n_rows) * 100
        print(f" {value}: {count} ({share:.1f}%)")

    # Flag heavy imbalance (<10% minority class).
    min_pct = (counts.min() / n_rows) * 100
    if min_pct < 10:
        print(f"\n Class imbalance detected! Smallest class: {min_pct:.1f}%")

    # For a numeric target, report the five strongest correlations.
    # Note: the values printed are absolute correlations, as in the original.
    numerical_cols = self.data.select_dtypes(include=[np.number]).columns.tolist()
    if target_col in numerical_cols:
        numerical_cols.remove(target_col)
        if numerical_cols:
            correlations = self.data[numerical_cols + [target_col]].corr()[target_col].drop(target_col)
            top_corr = correlations.abs().sort_values(ascending=False).head(5)

            print(f"\nTop Correlations with {target_col}:")
            for feature, corr in top_corr.items():
                print(f" {feature}: {corr:.3f}")
def suggest_preprocessing_steps(self):
    """Print preprocessing recommendations derived from the loaded dataset."""
    if self.data is None:
        print("Please load data first!")
        return

    print("\n" + "="*60)
    print("PREPROCESSING RECOMMENDATIONS")
    print("="*60)

    suggestions = []

    # Missing data.
    missing_data = self.data.isnull().sum()
    columns_with_missing = missing_data[missing_data > 0]
    if len(columns_with_missing) > 0:
        suggestions.append(" Handle missing data:")
        for col, count in columns_with_missing.items():
            pct = (count / len(self.data)) * 100
            if pct > 50:
                suggestions.append(f" - Consider dropping '{col}' ({pct:.1f}% missing)")
            elif self.data[col].dtype in ['object', 'category']:
                suggestions.append(f" - Impute '{col}' with mode/most frequent")
            else:
                suggestions.append(f" - Impute '{col}' with median/mean")

    # Duplicates.
    if self.data.duplicated().sum() > 0:
        suggestions.append(f" Remove {self.data.duplicated().sum()} duplicate rows")

    # Outliers (1.5*IQR fences).
    numerical_cols = self.data.select_dtypes(include=[np.number]).columns
    outlier_cols = []
    for col in numerical_cols:
        q1 = self.data[col].quantile(0.25)
        q3 = self.data[col].quantile(0.75)
        iqr = q3 - q1
        outliers = self.data[(self.data[col] < q1 - 1.5*iqr) | (self.data[col] > q3 + 1.5*iqr)]
        if len(outliers) > 0:
            outlier_cols.append((col, len(outliers)))

    if outlier_cols:
        suggestions.append(" Handle outliers:")
        for col, count in outlier_cols:
            pct = (count / len(self.data)) * 100
            suggestions.append(f" - '{col}' has {count} outliers ({pct:.1f}%)")

    # Skewed distributions (|skew| > 2).
    skewed_cols = []
    for col in numerical_cols:
        skewness = abs(self.data[col].skew())
        if skewness > 2:
            skewed_cols.append((col, skewness))

    if skewed_cols:
        suggestions.append(" Handle skewed distributions:")
        for col, skew in skewed_cols:
            suggestions.append(f" - Apply log/box-cox transformation to '{col}' (skew: {skew:.2f})")

    # Categorical encoding.
    categorical_cols = self.data.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        suggestions.append(" Encode categorical variables:")
        for col in categorical_cols:
            unique_count = self.data[col].nunique()
            if unique_count > 10:
                suggestions.append(f" - Consider target encoding for '{col}' ({unique_count} categories)")
            else:
                suggestions.append(f" - One-hot encode '{col}' ({unique_count} categories)")

    # Feature engineering.
    if len(numerical_cols) >= 2:
        suggestions.append("- Consider feature engineering:")
        suggestions.append(" - Create interaction features between numerical variables")
        suggestions.append(" - Create polynomial features for non-linear relationships")

    if suggestions:
        for suggestion in suggestions:
            print(suggestion)
    else:
        print("Data looks clean! No major preprocessing steps needed.")


def interactive_column_analysis(self):
    """Per-column analysis driven by an ipywidgets dropdown (notebook only)."""
    if not WIDGETS_AVAILABLE:
        print("Interactive widgets not available. Install ipywidgets.")
        return

    if self.data is None:
        print("Please load data first!")
        return

    def analyze_column(column_name):
        # Runs on every dropdown change; redraws stats plus a small plot.
        with output:
            clear_output(wait=True)
            col_data = self.data[column_name]

            print(f"Analysis for: {column_name}")
            print("="*40)
            print(f"Data Type: {col_data.dtype}")
            print(f"Non-null Values: {col_data.count()}/{len(col_data)} ({col_data.count()/len(col_data)*100:.1f}%)")
            print(f"Unique Values: {col_data.nunique()}")

            if pd.api.types.is_numeric_dtype(col_data):
                print(f"\nStatistics:")
                print(f" Mean: {col_data.mean():.2f}")
                print(f" Std: {col_data.std():.2f}")
                print(f" Min: {col_data.min():.2f}")
                print(f" Max: {col_data.max():.2f}")
                print(f" Skewness: {col_data.skew():.2f}")

                # Simple histogram
                plt.figure(figsize=(8, 4))
                plt.hist(col_data.dropna(), bins=20, alpha=0.7, edgecolor='black')
                plt.title(f'Distribution of {column_name}')
                plt.xlabel(column_name)
                plt.ylabel('Frequency')
                plt.show()
            else:
                print(f"\nTop Values:")
                top_values = col_data.value_counts().head(10)
                for value, count in top_values.items():
                    pct = (count / len(col_data)) * 100
                    print(f" {value}: {count} ({pct:.1f}%)")

                # Simple bar plot
                plt.figure(figsize=(10, 4))
                top_values.plot(kind='bar')
                plt.title(f'Top Values for {column_name}')
                plt.xticks(rotation=45)
                plt.tight_layout()
                plt.show()

    column_dropdown = widgets.Dropdown(
        options=list(self.data.columns),
        value=self.data.columns[0],
        description='Column:',
        style={'description_width': 'initial'}
    )

    output = widgets.Output()

    # NOTE(review): interact() renders its own output area; the extra Output
    # widget displayed below looks unused by interact — confirm intent.
    widgets.interact(analyze_column, column_name=column_dropdown)

    display(output)


# Fix: the example session previously ran at import time (hard-coded absolute
# user path, file I/O, plotting). Guard it so importing this module is
# side-effect free; running the file as a script behaves as before.
if __name__ == "__main__":
    explorer = InteractiveDataExplorer()

    # Example with the heart dataset — update the path for your environment.
    explorer.load_data('/Users/ksonar/Documents/Technical/project-template/data/heart.csv')
    explorer.quick_overview()
    explorer.generate_comprehensive_profile(target_col='target')
    explorer.create_visualizations(target_col='target')
    explorer.analyze_target_relationship('target')
    explorer.suggest_preprocessing_steps()

    # Interactive analysis (if widgets available)
    explorer.interactive_column_analysis()
+explorer.generate_comprehensive_profile(target_col='target') +explorer.create_visualizations(target_col='target') +explorer.analyze_target_relationship('target') +explorer.suggest_preprocessing_steps() + +# Interactive analysis (if widgets available) +explorer.interactive_column_analysis() diff --git a/Structured_data_template/train/src/__init__.py b/Structured_data_template/train/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Structured_data_template/train/src/utils/__init__.py b/Structured_data_template/train/src/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Structured_data_template/train/src/utils/data_utils.py b/Structured_data_template/train/src/utils/data_utils.py new file mode 100644 index 0000000..cb6b9ee --- /dev/null +++ b/Structured_data_template/train/src/utils/data_utils.py @@ -0,0 +1,858 @@ +""" +Data utilities for advanced data processing and feature engineering. +Fixed JSON serialization issues. +""" + +import pandas as pd +import numpy as np +import json +from typing import Dict, List, Any, Optional, Tuple, Union +from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler +from sklearn.feature_selection import SelectKBest, f_classif, f_regression, mutual_info_classif, mutual_info_regression +from sklearn.decomposition import PCA +from sklearn.manifold import TSNE +import warnings + + +class FeatureEngineer: + """Advanced feature engineering utilities.""" + + def __init__(self, random_state: int = 42): + self.random_state = random_state + self.created_features = [] + self.feature_creation_log = [] + + def create_polynomial_features(self, data: pd.DataFrame, columns: List[str], + degree: int = 2, include_bias: bool = False) -> pd.DataFrame: + """ + Create polynomial features for specified columns. 
+ + Args: + data: Input DataFrame + columns: Columns to create polynomial features for + degree: Polynomial degree + include_bias: Whether to include bias column + + Returns: + DataFrame with polynomial features added + """ + from sklearn.preprocessing import PolynomialFeatures + + poly = PolynomialFeatures(degree=degree, include_bias=include_bias) + + # Create polynomial features + poly_features = poly.fit_transform(data[columns]) + + # Get feature names + feature_names = poly.get_feature_names_out(columns) + + # Create DataFrame with new features + poly_df = pd.DataFrame(poly_features, columns=feature_names, index=data.index) + + # Add to original data (excluding original columns to avoid duplication) + original_cols = set(columns) + new_cols = [col for col in poly_df.columns if col not in original_cols] + + result_df = data.copy() + result_df[new_cols] = poly_df[new_cols] + + self.created_features.extend(new_cols) + self.feature_creation_log.append(f"Created {len(new_cols)} polynomial features (degree={degree})") + + print(f"Created {len(new_cols)} polynomial features") + return result_df + + def create_interaction_features(self, data: pd.DataFrame, columns: List[str], + max_combinations: int = 10) -> pd.DataFrame: + """ + Create interaction features between specified columns. 
+ + Args: + data: Input DataFrame + columns: Columns to create interactions for + max_combinations: Maximum number of combinations to create + + Returns: + DataFrame with interaction features added + """ + from itertools import combinations + + result_df = data.copy() + new_features = [] + + # Create pairwise interactions + for col1, col2 in combinations(columns, 2): + if len(new_features) >= max_combinations: + break + + # Skip if either column is not numeric + if not (pd.api.types.is_numeric_dtype(data[col1]) and pd.api.types.is_numeric_dtype(data[col2])): + continue + + interaction_name = f"{col1}_x_{col2}" + result_df[interaction_name] = data[col1] * data[col2] + new_features.append(interaction_name) + + self.created_features.extend(new_features) + self.feature_creation_log.append(f"Created {len(new_features)} interaction features") + + print(f"Created {len(new_features)} interaction features") + return result_df + + def create_binning_features(self, data: pd.DataFrame, column: str, + n_bins: int = 5, strategy: str = 'quantile') -> pd.DataFrame: + """ + Create binning features for a numerical column. 
+ + Args: + data: Input DataFrame + column: Column to bin + n_bins: Number of bins + strategy: Binning strategy ('uniform', 'quantile', 'kmeans') + + Returns: + DataFrame with binning features added + """ + from sklearn.preprocessing import KBinsDiscretizer + + if not pd.api.types.is_numeric_dtype(data[column]): + print(f"Column {column} is not numeric, skipping binning") + return data + + result_df = data.copy() + + # Create bins + kbd = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy) + binned_values = kbd.fit_transform(data[[column]]).flatten() + + # Add binned feature + binned_column_name = f"{column}_binned_{strategy}" + result_df[binned_column_name] = binned_values + + # Create one-hot encoded version + for i in range(n_bins): + bin_name = f"{column}_bin_{i}" + result_df[bin_name] = (binned_values == i).astype(int) + self.created_features.append(bin_name) + + self.created_features.append(binned_column_name) + self.feature_creation_log.append(f"Created binning features for {column} ({n_bins} bins, {strategy})") + + print(f"Created binning features for {column}") + return result_df + + def create_aggregation_features(self, data: pd.DataFrame, group_col: str, + agg_cols: List[str], agg_funcs: List[str] = None) -> pd.DataFrame: + """ + Create aggregation features based on grouping. 
+ + Args: + data: Input DataFrame + group_col: Column to group by + agg_cols: Columns to aggregate + agg_funcs: Aggregation functions to apply + + Returns: + DataFrame with aggregation features added + """ + if agg_funcs is None: + agg_funcs = ['mean', 'std', 'min', 'max', 'count'] + + result_df = data.copy() + new_features = [] + + for agg_col in agg_cols: + if not pd.api.types.is_numeric_dtype(data[agg_col]): + continue + + for func in agg_funcs: + try: + # Calculate aggregation + agg_values = data.groupby(group_col)[agg_col].transform(func) + + # Add to DataFrame + feature_name = f"{agg_col}_{func}_by_{group_col}" + result_df[feature_name] = agg_values + new_features.append(feature_name) + + except Exception as e: + print(f"Could not create {func} aggregation for {agg_col}: {e}") + continue + + self.created_features.extend(new_features) + self.feature_creation_log.append(f"Created {len(new_features)} aggregation features") + + print(f"Created {len(new_features)} aggregation features") + return result_df + + def create_datetime_features(self, data: pd.DataFrame, datetime_col: str) -> pd.DataFrame: + """ + Extract datetime features from a datetime column. 
+ + Args: + data: Input DataFrame + datetime_col: DateTime column name + + Returns: + DataFrame with datetime features added + """ + result_df = data.copy() + + # Convert to datetime if not already + if not pd.api.types.is_datetime64_any_dtype(data[datetime_col]): + try: + result_df[datetime_col] = pd.to_datetime(data[datetime_col]) + except Exception as e: + print(f"Could not convert {datetime_col} to datetime: {e}") + return data + + dt_series = result_df[datetime_col] + new_features = [] + + # Extract various datetime components + datetime_features = { + f"{datetime_col}_year": dt_series.dt.year, + f"{datetime_col}_month": dt_series.dt.month, + f"{datetime_col}_day": dt_series.dt.day, + f"{datetime_col}_dayofweek": dt_series.dt.dayofweek, + f"{datetime_col}_hour": dt_series.dt.hour, + f"{datetime_col}_is_weekend": (dt_series.dt.dayofweek >= 5).astype(int), + f"{datetime_col}_quarter": dt_series.dt.quarter, + f"{datetime_col}_is_month_start": dt_series.dt.is_month_start.astype(int), + f"{datetime_col}_is_month_end": dt_series.dt.is_month_end.astype(int), + } + + for feature_name, feature_values in datetime_features.items(): + if not feature_values.isna().all(): # Only add if not all NaN + result_df[feature_name] = feature_values + new_features.append(feature_name) + + self.created_features.extend(new_features) + self.feature_creation_log.append(f"Created {len(new_features)} datetime features from {datetime_col}") + + print(f"Created {len(new_features)} datetime features") + return result_df + + def get_feature_creation_summary(self) -> Dict[str, Any]: + """Get summary of feature creation operations.""" + return { + 'total_features_created': len(self.created_features), + 'created_features': self.created_features, + 'creation_log': self.feature_creation_log + } + + +class FeatureSelector: + """Advanced feature selection utilities.""" + + def __init__(self, random_state: int = 42): + self.random_state = random_state + self.selected_features = {} + 
self.selection_scores = {} + + def select_k_best_features(self, X: pd.DataFrame, y: pd.Series, + k: int = 10, task: str = "classification") -> List[str]: + """ + Select k best features using statistical tests. + + Args: + X: Feature matrix + y: Target vector + k: Number of features to select + task: "classification" or "regression" + + Returns: + List of selected feature names + """ + if task == "classification": + selector = SelectKBest(score_func=f_classif, k=min(k, X.shape[1])) + else: + selector = SelectKBest(score_func=f_regression, k=min(k, X.shape[1])) + + selector.fit(X, y) + + # Get selected features + selected_mask = selector.get_support() + selected_features = X.columns[selected_mask].tolist() + + # Store scores + feature_scores = dict(zip(X.columns, selector.scores_)) + self.selection_scores['k_best'] = feature_scores + self.selected_features['k_best'] = selected_features + + print(f"Selected {len(selected_features)} best features using statistical tests") + return selected_features + + def select_mutual_info_features(self, X: pd.DataFrame, y: pd.Series, + threshold: float = 0.01, task: str = "classification") -> List[str]: + """ + Select features using mutual information. 
+ + Args: + X: Feature matrix + y: Target vector + threshold: Minimum mutual information threshold + task: "classification" or "regression" + + Returns: + List of selected feature names + """ + if task == "classification": + mi_scores = mutual_info_classif(X, y, random_state=self.random_state) + else: + mi_scores = mutual_info_regression(X, y, random_state=self.random_state) + + # Select features above threshold + selected_mask = mi_scores > threshold + selected_features = X.columns[selected_mask].tolist() + + # Store scores + feature_scores = dict(zip(X.columns, mi_scores)) + self.selection_scores['mutual_info'] = feature_scores + self.selected_features['mutual_info'] = selected_features + + print(f"Selected {len(selected_features)} features using mutual information (threshold={threshold})") + return selected_features + + def select_correlation_features(self, X: pd.DataFrame, threshold: float = 0.95) -> List[str]: + """ + Remove highly correlated features. + + Args: + X: Feature matrix + threshold: Correlation threshold for removal + + Returns: + List of features to keep (low correlation) + """ + # Calculate correlation matrix + corr_matrix = X.corr().abs() + + # Find highly correlated pairs + upper_triangle = corr_matrix.where( + np.triu(np.ones(corr_matrix.shape), k=1).astype(bool) + ) + + # Find features to drop + to_drop = [column for column in upper_triangle.columns + if any(upper_triangle[column] > threshold)] + + # Features to keep + features_to_keep = [col for col in X.columns if col not in to_drop] + + self.selected_features['low_correlation'] = features_to_keep + + print(f"Removed {len(to_drop)} highly correlated features (threshold={threshold})") + return features_to_keep + + def select_variance_threshold_features(self, X: pd.DataFrame, + threshold: float = 0.0) -> List[str]: + """ + Remove features with low variance. 
+ + Args: + X: Feature matrix + threshold: Variance threshold + + Returns: + List of features with variance above threshold + """ + from sklearn.feature_selection import VarianceThreshold + + # Only consider numerical columns + numerical_cols = X.select_dtypes(include=[np.number]).columns + + if len(numerical_cols) == 0: + print("No numerical columns for variance threshold selection") + return X.columns.tolist() + + selector = VarianceThreshold(threshold=threshold) + selector.fit(X[numerical_cols]) + + # Get selected numerical features + selected_numerical = numerical_cols[selector.get_support()].tolist() + + # Include all categorical features + categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist() + selected_features = selected_numerical + categorical_cols + + removed_count = len(numerical_cols) - len(selected_numerical) + self.selected_features['variance_threshold'] = selected_features + + print(f"Removed {removed_count} low-variance features (threshold={threshold})") + return selected_features + + def get_feature_ranking(self, method: str = 'k_best') -> pd.DataFrame: + """ + Get feature ranking based on selection method. + + Args: + method: Selection method ('k_best', 'mutual_info') + + Returns: + DataFrame with feature rankings + """ + if method not in self.selection_scores: + raise ValueError(f"Scores for method '{method}' not available. 
Run selection first.") + + scores = self.selection_scores[method] + ranking_df = pd.DataFrame({ + 'feature': list(scores.keys()), + 'score': list(scores.values()) + }).sort_values('score', ascending=False).reset_index(drop=True) + + ranking_df['rank'] = ranking_df.index + 1 + return ranking_df + + +class DataTransformer: + """Advanced data transformation utilities.""" + + def __init__(self, random_state: int = 42): + self.random_state = random_state + self.transformers = {} + self.transformation_log = [] + + def handle_outliers(self, data: pd.DataFrame, columns: List[str], + method: str = 'iqr', factor: float = 1.5) -> pd.DataFrame: + """ + Handle outliers in specified columns. + + Args: + data: Input DataFrame + columns: Columns to handle outliers for + method: Method to use ('iqr', 'zscore', 'clip') + factor: Factor for outlier detection + + Returns: + DataFrame with outliers handled + """ + result_df = data.copy() + outlier_info = {} + + for col in columns: + if not pd.api.types.is_numeric_dtype(data[col]): + continue + + original_count = (~data[col].isna()).sum() + + if method == 'iqr': + Q1 = data[col].quantile(0.25) + Q3 = data[col].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - factor * IQR + upper_bound = Q3 + factor * IQR + + # Clip outliers + result_df[col] = result_df[col].clip(lower=lower_bound, upper=upper_bound) + + elif method == 'zscore': + mean_val = data[col].mean() + std_val = data[col].std() + lower_bound = mean_val - factor * std_val + upper_bound = mean_val + factor * std_val + + # Clip outliers + result_df[col] = result_df[col].clip(lower=lower_bound, upper=upper_bound) + + elif method == 'clip': + # Clip at percentiles + lower_percentile = (1 - 0.99) / 2 * 100 + upper_percentile = 100 - lower_percentile + + lower_bound = data[col].quantile(lower_percentile / 100) + upper_bound = data[col].quantile(upper_percentile / 100) + + result_df[col] = result_df[col].clip(lower=lower_bound, upper=upper_bound) + + # Count outliers handled + 
outliers_handled = ( + (data[col] < result_df[col]).sum() + + (data[col] > result_df[col]).sum() + ) + + outlier_info[col] = { + 'outliers_handled': outliers_handled, + 'percentage': (outliers_handled / original_count) * 100 if original_count > 0 else 0 + } + + total_outliers = sum(info['outliers_handled'] for info in outlier_info.values()) + self.transformation_log.append(f"Handled {total_outliers} outliers using {method} method") + + print(f"Handled outliers in {len(columns)} columns using {method} method") + return result_df + + def apply_log_transform(self, data: pd.DataFrame, columns: List[str]) -> pd.DataFrame: + """ + Apply log transformation to specified columns. + + Args: + data: Input DataFrame + columns: Columns to transform + + Returns: + DataFrame with log-transformed columns + """ + result_df = data.copy() + transformed_cols = [] + + for col in columns: + if not pd.api.types.is_numeric_dtype(data[col]): + continue + + # Check if all values are positive + if (data[col] <= 0).any(): + print(f"Column {col} contains non-positive values, using log1p instead") + result_df[f"{col}_log"] = np.log1p(data[col]) + else: + result_df[f"{col}_log"] = np.log(data[col]) + + transformed_cols.append(f"{col}_log") + + self.transformation_log.append(f"Applied log transformation to {len(transformed_cols)} columns") + print(f"Applied log transformation to {len(transformed_cols)} columns") + return result_df + + def apply_box_cox_transform(self, data: pd.DataFrame, columns: List[str]) -> pd.DataFrame: + """ + Apply Box-Cox transformation to specified columns. 
+ + Args: + data: Input DataFrame + columns: Columns to transform + + Returns: + DataFrame with Box-Cox transformed columns + """ + from scipy.stats import boxcox + + result_df = data.copy() + transformed_cols = [] + + for col in columns: + if not pd.api.types.is_numeric_dtype(data[col]): + continue + + # Box-Cox requires positive values + if (data[col] <= 0).any(): + print(f"Column {col} contains non-positive values, skipping Box-Cox") + continue + + try: + transformed_data, lambda_param = boxcox(data[col].dropna()) + + # Apply transformation to full column + result_df[f"{col}_boxcox"] = boxcox(data[col], lmbda=lambda_param) + transformed_cols.append(f"{col}_boxcox") + + # Store transformation parameters + self.transformers[f"{col}_boxcox_lambda"] = lambda_param + + except Exception as e: + print(f"Could not apply Box-Cox to {col}: {e}") + continue + + self.transformation_log.append(f"Applied Box-Cox transformation to {len(transformed_cols)} columns") + print(f"Applied Box-Cox transformation to {len(transformed_cols)} columns") + return result_df + + def create_target_encoding(self, data: pd.DataFrame, categorical_col: str, + target_col: str, smoothing: float = 10.0) -> pd.DataFrame: + """ + Create target encoding for a categorical column. 
+
+        Args:
+            data: Input DataFrame
+            categorical_col: Categorical column to encode
+            target_col: Target column
+            smoothing: Smoothing factor
+
+        Returns:
+            DataFrame with target encoding added
+        """
+        result_df = data.copy()
+
+        # Calculate global mean
+        global_mean = data[target_col].mean()
+
+        # Calculate category means and counts
+        category_stats = data.groupby(categorical_col)[target_col].agg(['mean', 'count'])
+
+        # Apply smoothing
+        smoothed_means = (
+            (category_stats['mean'] * category_stats['count'] + global_mean * smoothing) /
+            (category_stats['count'] + smoothing)
+        )
+
+        # Create encoded column
+        encoded_col_name = f"{categorical_col}_target_encoded"
+        result_df[encoded_col_name] = result_df[categorical_col].map(smoothed_means)
+
+        # Fill missing values with global mean (assign back: chained `inplace=True` fillna is a no-op under pandas Copy-on-Write)
+        result_df[encoded_col_name] = result_df[encoded_col_name].fillna(global_mean)
+
+        # Store encoding mapping
+        self.transformers[f"{categorical_col}_target_encoding"] = smoothed_means.to_dict()
+
+        self.transformation_log.append(f"Created target encoding for {categorical_col}")
+        print(f"Created target encoding for {categorical_col}")
+        return result_df
+
+
+class DataProfiler:
+    """Comprehensive data profiling utilities."""
+
+    def __init__(self):
+        pass
+
+    def generate_data_profile(self, data: pd.DataFrame, target_col: str = None) -> Dict[str, Any]:
+        """
+        Generate comprehensive data profile.
+ + Args: + data: Input DataFrame + target_col: Target column name (optional) + + Returns: + Dictionary with comprehensive data profile + """ + profile = { + 'overview': self._get_overview(data), + 'columns': self._get_column_profiles(data), + 'correlations': self._get_correlation_analysis(data), + 'missing_data': self._get_missing_data_analysis(data), + 'duplicates': self._get_duplicate_analysis(data), + 'data_quality': self._assess_data_quality(data) + } + + if target_col and target_col in data.columns: + profile['target_analysis'] = self._get_target_analysis(data, target_col) + + return profile + + def _get_overview(self, data: pd.DataFrame) -> Dict[str, Any]: + """Get basic overview of the dataset.""" + return { + 'shape': data.shape, + 'memory_usage_mb': data.memory_usage(deep=True).sum() / (1024 * 1024), + 'dtypes': data.dtypes.value_counts().to_dict(), + 'numerical_columns': data.select_dtypes(include=[np.number]).columns.tolist(), + 'categorical_columns': data.select_dtypes(exclude=[np.number]).columns.tolist() + } + + def _get_column_profiles(self, data: pd.DataFrame) -> Dict[str, Dict[str, Any]]: + """Get detailed profile for each column.""" + profiles = {} + + for col in data.columns: + col_data = data[col] + + base_profile = { + 'dtype': str(col_data.dtype), + 'missing_count': int(col_data.isnull().sum()), + 'missing_percentage': float((col_data.isnull().sum() / len(col_data)) * 100), + 'unique_count': int(col_data.nunique()), + 'unique_percentage': float((col_data.nunique() / len(col_data)) * 100) + } + + if pd.api.types.is_numeric_dtype(col_data): + # Numerical column profile + base_profile.update({ + 'mean': float(col_data.mean()) if not pd.isna(col_data.mean()) else None, + 'std': float(col_data.std()) if not pd.isna(col_data.std()) else None, + 'min': float(col_data.min()) if not pd.isna(col_data.min()) else None, + 'max': float(col_data.max()) if not pd.isna(col_data.max()) else None, + 'q25': float(col_data.quantile(0.25)) if not 
pd.isna(col_data.quantile(0.25)) else None, + 'q50': float(col_data.quantile(0.50)) if not pd.isna(col_data.quantile(0.50)) else None, + 'q75': float(col_data.quantile(0.75)) if not pd.isna(col_data.quantile(0.75)) else None, + 'skewness': float(col_data.skew()) if not pd.isna(col_data.skew()) else None, + 'kurtosis': float(col_data.kurtosis()) if not pd.isna(col_data.kurtosis()) else None, + 'zeros_count': int((col_data == 0).sum()), + 'zeros_percentage': float(((col_data == 0).sum() / len(col_data)) * 100) + }) + else: + # Categorical column profile + value_counts = col_data.value_counts() + base_profile.update({ + 'most_frequent': str(value_counts.index[0]) if len(value_counts) > 0 else None, + 'most_frequent_count': int(value_counts.iloc[0]) if len(value_counts) > 0 else 0, + 'least_frequent': str(value_counts.index[-1]) if len(value_counts) > 0 else None, + 'least_frequent_count': int(value_counts.iloc[-1]) if len(value_counts) > 0 else 0, + 'top_5_values': {str(k): int(v) for k, v in value_counts.head(5).items()} + }) + + profiles[col] = base_profile + + return profiles + + def _get_correlation_analysis(self, data: pd.DataFrame) -> Dict[str, Any]: + """Analyze correlations between numerical columns.""" + numerical_data = data.select_dtypes(include=[np.number]) + + if numerical_data.empty: + return {'message': 'No numerical columns for correlation analysis'} + + corr_matrix = numerical_data.corr() + + # Find highly correlated pairs + high_corr_pairs = [] + for i in range(len(corr_matrix.columns)): + for j in range(i+1, len(corr_matrix.columns)): + corr_value = corr_matrix.iloc[i, j] + if abs(corr_value) > 0.7: # High correlation threshold + high_corr_pairs.append({ + 'feature1': str(corr_matrix.columns[i]), + 'feature2': str(corr_matrix.columns[j]), + 'correlation': float(corr_value) + }) + + return { + 'correlation_matrix': {str(k): {str(k2): float(v2) for k2, v2 in v.items()} for k, v in corr_matrix.to_dict().items()}, + 'high_correlations': high_corr_pairs, 
+ 'max_correlation': float(abs(corr_matrix.values[np.triu_indices_from(corr_matrix.values, k=1)]).max()), + 'mean_correlation': float(abs(corr_matrix.values[np.triu_indices_from(corr_matrix.values, k=1)]).mean()) + } + + def _get_missing_data_analysis(self, data: pd.DataFrame) -> Dict[str, Any]: + """Analyze missing data patterns.""" + missing_counts = data.isnull().sum() + missing_percentages = (missing_counts / len(data)) * 100 + + return { + 'total_missing_cells': int(missing_counts.sum()), + 'missing_percentage_overall': float((missing_counts.sum() / data.size) * 100), + 'columns_with_missing': {str(k): int(v) for k, v in missing_counts[missing_counts > 0].items()}, + 'missing_percentages': {str(k): float(v) for k, v in missing_percentages[missing_percentages > 0].items()}, + 'complete_rows': int(len(data) - data.isnull().any(axis=1).sum()), + 'complete_rows_percentage': float(((len(data) - data.isnull().any(axis=1).sum()) / len(data)) * 100) + } + + def _get_duplicate_analysis(self, data: pd.DataFrame) -> Dict[str, Any]: + """Analyze duplicate data.""" + duplicate_rows = data.duplicated().sum() + + return { + 'duplicate_rows': int(duplicate_rows), + 'duplicate_percentage': float((duplicate_rows / len(data)) * 100), + 'unique_rows': int(len(data) - duplicate_rows) + } + + def _assess_data_quality(self, data: pd.DataFrame) -> Dict[str, float]: + """Assess overall data quality.""" + # Completeness + completeness = 1 - (data.isnull().sum().sum() / data.size) + + # Uniqueness + uniqueness = 1 - (data.duplicated().sum() / len(data)) + + # Consistency (based on mixed types in object columns) + consistency_issues = 0 + object_cols = data.select_dtypes(include=['object']).columns + for col in object_cols: + # Simple check for mixed types (numbers in string columns) + try: + numeric_count = pd.to_numeric(data[col].dropna(), errors='coerce').notna().sum() + total_non_null = data[col].notna().sum() + if total_non_null > 0 and 0 < numeric_count < total_non_null: + 
consistency_issues += 1
+            except Exception:  # narrowed from bare `except:` so SystemExit/KeyboardInterrupt are not swallowed
+                pass
+
+        consistency = 1 - (consistency_issues / len(object_cols)) if len(object_cols) > 0 else 1.0
+
+        # Overall quality score
+        overall = np.mean([completeness, uniqueness, consistency])
+
+        return {
+            'completeness': float(completeness),
+            'uniqueness': float(uniqueness),
+            'consistency': float(consistency),
+            'overall_quality': float(overall)
+        }
+
+    def _get_target_analysis(self, data: pd.DataFrame, target_col: str) -> Dict[str, Any]:
+        """Analyze target column specifically."""
+        target_data = data[target_col]
+
+        analysis = {
+            'dtype': str(target_data.dtype),
+            'missing_count': int(target_data.isnull().sum()),
+            'unique_count': int(target_data.nunique()),
+            'distribution': {str(k): int(v) for k, v in target_data.value_counts().items()}
+        }
+
+        if pd.api.types.is_numeric_dtype(target_data):
+            analysis.update({
+                'mean': float(target_data.mean()) if not pd.isna(target_data.mean()) else None,
+                'std': float(target_data.std()) if not pd.isna(target_data.std()) else None,
+                'min': float(target_data.min()) if not pd.isna(target_data.min()) else None,
+                'max': float(target_data.max()) if not pd.isna(target_data.max()) else None,
+                'skewness': float(target_data.skew()) if not pd.isna(target_data.skew()) else None,
+                'recommended_task': 'regression' if target_data.nunique() > 20 else 'classification'
+            })
+        else:
+            analysis['recommended_task'] = 'classification'
+
+        return analysis
+
+    def save_profile_report(self, profile: Dict[str, Any], output_path: str = "data_profile_report.json") -> str:
+        """Save data profile to JSON file with proper serialization."""
+        # Convert numpy types to native Python types for JSON serialization
+        def convert_types(obj):
+            if isinstance(obj, (np.integer, np.int64, np.int32, np.int16, np.int8)):
+                return int(obj)
+            elif isinstance(obj, (np.floating, np.float64, np.float32, np.float16)):
+                return float(obj)
+            elif isinstance(obj, np.ndarray):
+                return obj.tolist()
+            elif isinstance(obj, pd.Timestamp):
+ return obj.isoformat() + elif isinstance(obj, (pd.CategoricalDtype, pd.StringDtype)): + return str(obj) + elif hasattr(obj, 'dtype') and hasattr(obj.dtype, 'name'): + return str(obj.dtype.name) + elif pd.isna(obj): + return None + elif isinstance(obj, (pd.Series, pd.Index)): + return obj.tolist() + return obj + + def clean_dict(d): + if isinstance(d, dict): + cleaned = {} + for k, v in d.items(): + # Convert keys to strings if they aren't already + clean_key = str(k) if not isinstance(k, (str, int, float, bool, type(None))) else k + cleaned[clean_key] = clean_dict(v) + return cleaned + elif isinstance(d, list): + return [clean_dict(v) for v in d] + else: + return convert_types(d) + + clean_profile = clean_dict(profile) + + try: + with open(output_path, 'w') as f: + json.dump(clean_profile, f, indent=2, default=str) + + print(f"Data profile report saved: {output_path}") + return output_path + except Exception as e: + print(f"Error saving profile report: {e}") + # Try saving a simplified version + simplified_profile = { + 'overview': clean_profile.get('overview', {}), + 'data_quality': clean_profile.get('data_quality', {}), + 'missing_data': clean_profile.get('missing_data', {}) + } + + try: + simplified_path = output_path.replace('.json', '_simplified.json') + with open(simplified_path, 'w') as f: + json.dump(simplified_profile, f, indent=2, default=str) + print(f"Simplified profile report saved: {simplified_path}") + return simplified_path + except Exception as e2: + print(f"Could not save profile report: {e2}") + return "" \ No newline at end of file diff --git a/Structured_data_template/train/src/utils/parse_config.py b/Structured_data_template/train/src/utils/parse_config.py new file mode 100644 index 0000000..072ce89 --- /dev/null +++ b/Structured_data_template/train/src/utils/parse_config.py @@ -0,0 +1,242 @@ +import configparser +import os +import json +from typing import Dict, Any, Tuple, Union, List + +from sklearn.ensemble import RandomForestClassifier, 
def load_config(config_path: str) -> configparser.ConfigParser:
    """Load configuration from a .cfg file.

    Args:
        config_path: Path to the configuration file.

    Returns:
        The parsed configuration object.

    Raises:
        FileNotFoundError: If no file exists at ``config_path``.
    """
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    config = configparser.ConfigParser()
    config.read(config_path)
    return config


def get_model_and_hyperparams(config: configparser.ConfigParser) -> Tuple[Any, Dict[str, Any]]:
    """
    Retrieves the model class and its hyperparameters from the configuration.

    Args:
        config: The loaded configuration object.

    Returns:
        tuple: Model class and dictionary of hyperparameters.

    Raises:
        ValueError: If unsupported model or invalid task combination.
    """
    model_name = config["model"]["model_name"].strip().lower()
    task = config["model"]["task"].strip().lower()

    # Map (model_name -> task -> estimator class). Built inside the function
    # so the module imports cleanly even if an estimator import failed lazily.
    model_mapping = {
        "random_forest": {
            "classification": RandomForestClassifier,
            "regression": RandomForestRegressor,
        },
        "xgboost": {
            "classification": XGBClassifier,
            "regression": XGBRegressor,
        },
        "linear_regression": {
            "regression": LinearRegression,
        },
        "logistic_regression": {
            "classification": LogisticRegression,
        },
    }

    if model_name not in model_mapping:
        raise ValueError(f"Unsupported model: {model_name}")

    if task not in model_mapping[model_name]:
        raise ValueError(f"Model {model_name} does not support task: {task}")

    model_class = model_mapping[model_name][task]

    # Hyperparameters live in an optional per-model section; absent section
    # means "use estimator defaults".
    hyperparams_section = f"hyperparameters.{model_name}"
    if hyperparams_section not in config:
        return model_class, {}

    # configparser stores everything as strings; coerce to bool/int/float.
    hyperparams = {
        key: _convert_config_value(value)
        for key, value in config[hyperparams_section].items()
    }
    return model_class, hyperparams


def get_data_config(config: configparser.ConfigParser) -> Dict[str, Any]:
    """
    Retrieves data-related configurations.

    Args:
        config: The loaded configuration object.

    Returns:
        dict: Data configuration settings, including parsed per-column
        preprocessing settings under the key ``preprocessor_settings``.
    """
    data_section = config["data"]

    data_config = {
        "data_path": data_section["data_path"],
        "target_column": data_section["target_column"],
        "visualise_data": data_section.getboolean("visualise_data"),
        "check_imbalance": data_section.getboolean("check_imbalance"),
        "test_size": data_section.getfloat("test_size"),
        "random_state": data_section.getint("random_state"),
        "stratify": data_section.getboolean("stratify"),
        # columns_to_drop is optional; missing key yields an empty list.
        "columns_to_drop": _parse_comma_separated(data_section.get("columns_to_drop", "")),
    }

    # Parse per-column preprocessing settings from their own sections.
    data_config["preprocessor_settings"] = _parse_preprocessing_config(config)

    return data_config


def _parse_preprocessing_config(config: configparser.ConfigParser) -> Dict[str, Dict[str, Dict[str, Any]]]:
    """Parse preprocessing configuration from config sections.

    Keys in the ``preprocessing.categorical`` / ``preprocessing.numerical``
    sections follow the form ``<column>.<setting>``. Keys without a ``.``
    separator are skipped instead of raising (previously a ValueError).

    Returns:
        dict: ``{"categorical": {col: {setting: value}}, "numerical": {...}}``.
    """
    return {
        # Only "encoder_options" values are JSON-decoded for categorical columns.
        "categorical": _parse_preprocessing_section(
            config, "preprocessing.categorical", json_keys=("encoder_options",)
        ),
        "numerical": _parse_preprocessing_section(
            config, "preprocessing.numerical", json_keys=()
        ),
    }


def _parse_preprocessing_section(config: configparser.ConfigParser,
                                 section_name: str,
                                 json_keys: Tuple[str, ...] = ()) -> Dict[str, Dict[str, Any]]:
    """Parse one ``<column>.<setting> = value`` preprocessing section.

    Args:
        config: The loaded configuration object.
        section_name: Name of the config section to read (may be absent).
        json_keys: Setting names whose values should be JSON-decoded;
            undecodable values fall back to ``{}``.

    Returns:
        dict: Mapping of column name to its settings dict.
    """
    settings: Dict[str, Dict[str, Any]] = {}
    if section_name not in config:
        return settings

    for key, value in config[section_name].items():
        column_name, sep, setting_type = key.partition(".")
        if not sep:
            # Malformed key without a '<column>.<setting>' separator; ignore it
            # rather than crashing the whole config load.
            continue

        column_settings = settings.setdefault(column_name, {})
        if setting_type in json_keys:
            try:
                column_settings[setting_type] = json.loads(value)
            except json.JSONDecodeError:
                # Bad JSON degrades to empty options rather than failing.
                column_settings[setting_type] = {}
        else:
            column_settings[setting_type] = value

    return settings


def get_logging_config(config: configparser.ConfigParser) -> Dict[str, Any]:
    """
    Retrieves logging-related configurations.

    Args:
        config: The loaded configuration object.

    Returns:
        dict: Logging configuration settings.
    """
    logging_section = config["logging"]

    return {
        "save_model": logging_section.getboolean("save_model"),
        "model_output_path": logging_section["model_output_path"],
        "mlflow_tracking_uri": logging_section["mlflow_tracking_uri"],
    }


def get_training_config(config: configparser.ConfigParser) -> Dict[str, Any]:
    """
    Retrieves training-related configurations.

    Args:
        config: The loaded configuration object.

    Returns:
        dict: Training configuration settings.
    """
    training_section = config["training"]

    return {
        "use_kfold_cv": training_section.getboolean("use_kfold_cv"),
        "n_splits": training_section.getint("n_splits"),
        "shuffle_cv": training_section.getboolean("shuffle_cv"),
        "random_state_cv": training_section.getint("random_state_cv"),
        "use_optuna": training_section.getboolean("use_optuna"),
        "n_trials": training_section.getint("n_trials"),
        "timeout": training_section.getint("timeout"),
        "optuna_direction": training_section["optuna_direction"],
    }


def _convert_config_value(value: str) -> Union[str, int, float, bool]:
    """Convert a string configuration value to the most specific Python type.

    Tries, in order: bool ("true"/"false", case-insensitive), int (only when
    no '.' is present), float, then falls back to the stripped string.
    """
    value = value.strip()

    # Boolean conversion
    if value.lower() in ('true', 'false'):
        return value.lower() == 'true'

    # Integer conversion (skip values containing '.' so "1.0" becomes a float)
    try:
        if '.' not in value:
            return int(value)
    except ValueError:
        pass

    # Float conversion
    try:
        return float(value)
    except ValueError:
        pass

    # Return as string
    return value


def _parse_comma_separated(value: str) -> List[str]:
    """Parse a comma-separated string into a list of stripped, non-empty items."""
    if not value or not value.strip():
        return []
    return [item.strip() for item in value.split(',') if item.strip()]


def validate_config(config: configparser.ConfigParser) -> None:
    """
    Validate configuration settings.

    Args:
        config: The loaded configuration object.

    Raises:
        ValueError: If a required section is missing, the data file does not
            exist, or the model/task combination is invalid.
    """
    required_sections = ["data", "model", "logging", "training"]
    for section in required_sections:
        if section not in config:
            raise ValueError(f"Required section '{section}' missing from config")

    # Validate data path exists
    data_path = config["data"]["data_path"]
    if not os.path.exists(data_path):
        raise ValueError(f"Data file not found: {data_path}")

    # Validate model and task combination by attempting resolution.
    try:
        get_model_and_hyperparams(config)
    except ValueError as e:
        raise ValueError(f"Invalid model configuration: {e}")

    print("Configuration validation passed.")
class DataVisualizer:
    """Comprehensive data visualization utilities.

    Produces distribution, correlation, missing-data, outlier, and
    feature-importance plots; every plot is saved as a PNG under
    ``output_dir`` and (where the backend allows) shown non-blocking.
    """

    def __init__(self, output_dir: str = "plots", style: str = "whitegrid",
                 figsize: Tuple[int, int] = (10, 6), dpi: int = 300):
        """
        Initialize visualizer.

        Args:
            output_dir: Directory to save plots (created if missing).
            style: Seaborn style passed to ``sns.set_style``.
            figsize: Default figure size.
            dpi: Plot resolution used when saving figures.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.figsize = figsize
        self.dpi = dpi

        # Set style
        plt.style.use('default')
        sns.set_style(style)

        # Try to set interactive backend
        self._setup_backend()

    def _setup_backend(self):
        """Setup matplotlib backend for optimal display.

        Tries interactive GUI backends in order; falls back to the
        non-interactive 'Agg' backend (save-only) when none import.
        """
        try:
            # Try interactive backends
            for backend in ['TkAgg', 'Qt5Agg', 'QtAgg']:
                try:
                    matplotlib.use(backend)
                    break
                except ImportError:
                    continue
            else:
                # Fallback to non-interactive
                matplotlib.use('Agg')
                print("Using non-interactive backend. Plots will be saved only.")
        except Exception:
            print("Backend setup failed. Using default.")

    def plot_distributions(self, data: pd.DataFrame, columns: Optional[List[str]] = None,
                           target_column: Optional[str] = None,
                           save_name: str = "distributions") -> None:
        """
        Create comprehensive distribution plots.

        Args:
            data: DataFrame to plot
            columns: Specific columns to plot (None for all)
            target_column: Target column for colored distributions
            save_name: Base name for saved plots
        """
        if columns is None:
            numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
            categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
        else:
            # Anything non-numeric among the requested columns is treated as categorical.
            numerical_cols = [col for col in columns if pd.api.types.is_numeric_dtype(data[col])]
            categorical_cols = [col for col in columns if col not in numerical_cols]

        # Remove target from feature columns
        if target_column:
            numerical_cols = [col for col in numerical_cols if col != target_column]
            categorical_cols = [col for col in categorical_cols if col != target_column]

        # Plot numerical distributions
        if numerical_cols:
            self._plot_numerical_distributions(data, numerical_cols, target_column,
                                               f"{save_name}_numerical")

        # Plot categorical distributions
        if categorical_cols:
            self._plot_categorical_distributions(data, categorical_cols, target_column,
                                                 f"{save_name}_categorical")

        # Plot target distribution separately
        if target_column and target_column in data.columns:
            self._plot_target_distribution(data, target_column, f"{save_name}_target")

    def _plot_numerical_distributions(self, data: pd.DataFrame, columns: List[str],
                                      target_column: Optional[str], save_name: str) -> None:
        """Plot numerical feature distributions (histogram + KDE per column)."""
        # Grid of up to 3 plots per row.
        n_cols = min(3, len(columns))
        n_rows = (len(columns) + n_cols - 1) // n_cols

        fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
        # Normalize `axes` to a flat indexable sequence regardless of grid shape.
        if n_rows * n_cols == 1:
            axes = [axes]
        elif n_rows == 1:
            axes = [axes] if not isinstance(axes, np.ndarray) else axes
        else:
            axes = axes.flatten()

        for i, col in enumerate(columns):
            ax = axes[i]

            if target_column and target_column in data.columns:
                # Colored by target
                unique_targets = data[target_column].unique()
                if len(unique_targets) <= 10:  # Only if not too many classes
                    for target_val in unique_targets:
                        subset = data[data[target_column] == target_val]
                        if len(subset) > 0:
                            sns.histplot(subset[col], alpha=0.7, label=f"{target_column}={target_val}",
                                         kde=True, ax=ax)
                    ax.legend()
                else:
                    sns.histplot(data[col], kde=True, ax=ax)
            else:
                sns.histplot(data[col], kde=True, ax=ax)

            # Add statistics (mean line plus mean/std annotation box)
            mean_val = data[col].mean()
            std_val = data[col].std()
            ax.axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'μ={mean_val:.2f}')
            ax.text(0.02, 0.98, f'μ={mean_val:.2f}\nσ={std_val:.2f}',
                    transform=ax.transAxes, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

            ax.set_title(f'Distribution of {col}', fontweight='bold')
            ax.grid(True, alpha=0.3)

        # Hide empty subplots
        for i in range(len(columns), len(axes)):
            axes[i].set_visible(False)

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def _plot_categorical_distributions(self, data: pd.DataFrame, columns: List[str],
                                        target_column: Optional[str], save_name: str) -> None:
        """Plot categorical feature distributions (top-15 categories per column)."""
        # Grid of up to 2 plots per row (categorical plots are wider).
        n_cols = min(2, len(columns))
        n_rows = (len(columns) + n_cols - 1) // n_cols

        fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 4 * n_rows))
        # Normalize `axes` to a flat indexable sequence regardless of grid shape.
        if n_rows * n_cols == 1:
            axes = [axes]
        elif n_rows == 1:
            axes = [axes] if not isinstance(axes, np.ndarray) else axes
        else:
            axes = axes.flatten()

        for i, col in enumerate(columns):
            ax = axes[i]

            # Get top categories to avoid overcrowding
            value_counts = data[col].value_counts().head(15)

            if target_column and target_column in data.columns:
                # Stacked bar plot by target
                subset_data = data[data[col].isin(value_counts.index)]
                if len(subset_data) > 0:
                    pd.crosstab(subset_data[col], subset_data[target_column]).plot(kind='bar',
                                                                                   stacked=True, ax=ax, rot=45)
                    ax.legend(title=target_column, bbox_to_anchor=(1.05, 1), loc='upper left')
            else:
                # Simple count plot
                sns.countplot(data=data[data[col].isin(value_counts.index)],
                              x=col, order=value_counts.index, ax=ax)
                ax.tick_params(axis='x', rotation=45)

                # Add count annotations
                for p in ax.patches:
                    height = p.get_height()
                    if height > 0:
                        ax.annotate(f'{int(height)}', (p.get_x() + p.get_width()/2., height),
                                    ha='center', va='bottom', fontsize=8)

            ax.set_title(f'Distribution of {col}', fontweight='bold')
            ax.grid(True, alpha=0.3, axis='y')

            # Add unique count info
            unique_count = data[col].nunique()
            ax.text(0.02, 0.98, f'Unique: {unique_count}',
                    transform=ax.transAxes, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))

        # Hide empty subplots
        for i in range(len(columns), len(axes)):
            axes[i].set_visible(False)

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def _plot_target_distribution(self, data: pd.DataFrame, target_column: str,
                                  save_name: str) -> None:
        """Plot target variable distribution plus a target-vs-features panel.

        Left panel: count plot for categorical-like targets (dtype object/
        category or <= 20 unique values), histogram + KDE otherwise.
        Right panel: correlation bars when the target is numeric; for a
        categorical target, the first numeric feature's distribution split
        by target category (at most 5 categories).
        """
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        target_data = data[target_column]

        # Left plot: Basic distribution
        if target_data.dtype in ['object', 'category'] or target_data.nunique() <= 20:
            # Categorical target
            sns.countplot(data=data, x=target_column, ax=axes[0])
            axes[0].set_title(f'Target Distribution: {target_column}')
            axes[0].tick_params(axis='x', rotation=45)

            # Add percentage labels
            total = len(target_data)
            for p in axes[0].patches:
                percentage = f'{100 * p.get_height() / total:.1f}%'
                axes[0].annotate(percentage, (p.get_x() + p.get_width()/2., p.get_height()),
                                 ha='center', va='bottom')
        else:
            # Numerical target
            sns.histplot(data=data, x=target_column, kde=True, ax=axes[0])
            axes[0].set_title(f'Target Distribution: {target_column}')

            # Add statistics
            mean_val = target_data.mean()
            std_val = target_data.std()
            axes[0].axvline(mean_val, color='red', linestyle='--', alpha=0.7)
            axes[0].text(0.02, 0.98, f'μ={mean_val:.2f}\nσ={std_val:.2f}',
                         transform=axes[0].transAxes, verticalalignment='top',
                         bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.8))

        # Right plot: Target vs top features relationship
        numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
        if target_column in numerical_cols:
            numerical_cols.remove(target_column)

        if numerical_cols and pd.api.types.is_numeric_dtype(data[target_column]):
            # Correlation heatmap with numerical target (only if target is also numerical)
            try:
                target_corr = data[numerical_cols + [target_column]].corr()[target_column].drop(target_column)
                top_corr = target_corr.abs().nlargest(min(10, len(target_corr)))

                sns.barplot(x=top_corr.values, y=top_corr.index, ax=axes[1])
                axes[1].set_title(f'Top Features Correlated with {target_column}')
                axes[1].set_xlabel('Correlation Coefficient')
            except Exception as e:
                axes[1].text(0.5, 0.5, f'Could not compute correlations\n({str(e)})',
                             ha='center', va='center', transform=axes[1].transAxes)
                axes[1].set_title('Correlation Analysis')
        elif numerical_cols:
            # For categorical target, show feature importance or basic stats
            try:
                # Show distribution of first numerical feature by target categories
                if len(numerical_cols) > 0:
                    first_num_col = numerical_cols[0]
                    for target_val in data[target_column].unique()[:5]:  # Limit to 5 categories
                        subset = data[data[target_column] == target_val]
                        if len(subset) > 0:
                            sns.histplot(subset[first_num_col], alpha=0.6,
                                         label=f'{target_column}={target_val}', ax=axes[1])
                    axes[1].legend()
                    axes[1].set_title(f'{first_num_col} Distribution by {target_column}')
                else:
                    axes[1].text(0.5, 0.5, 'No numerical features\nfor analysis',
                                 ha='center', va='center', transform=axes[1].transAxes)
                    axes[1].set_title('Feature Analysis')
            except Exception as e:
                axes[1].text(0.5, 0.5, f'Analysis not available\n({str(e)})',
                             ha='center', va='center', transform=axes[1].transAxes)
                axes[1].set_title('Feature Analysis')
        else:
            axes[1].text(0.5, 0.5, 'No numerical features\nfor correlation analysis',
                         ha='center', va='center', transform=axes[1].transAxes)
            axes[1].set_title('Correlation Analysis')

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def plot_correlation_matrix(self, data: pd.DataFrame, method: str = 'pearson',
                                save_name: str = "correlation_matrix") -> None:
        """Create correlation matrix heatmap.

        Args:
            data: DataFrame; only numeric columns are used.
            method: Correlation method forwarded to ``DataFrame.corr``
                ('pearson', 'kendall', or 'spearman').
            save_name: Base name for the saved plot.
        """
        numerical_data = data.select_dtypes(include=[np.number])

        if numerical_data.empty:
            print("No numerical columns for correlation analysis")
            return

        # Calculate correlation matrix
        corr_matrix = numerical_data.corr(method=method)

        # Create heatmap sized with the number of columns
        fig, ax = plt.subplots(figsize=(max(8, len(corr_matrix.columns)),
                                        max(6, len(corr_matrix.columns) * 0.8)))

        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # Mask upper triangle

        sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
                    square=True, fmt='.2f', cbar_kws={"shrink": .8}, ax=ax)

        ax.set_title(f'Feature Correlation Matrix ({method.title()})',
                     fontsize=14, fontweight='bold')

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def plot_missing_data_pattern(self, data: pd.DataFrame,
                                  save_name: str = "missing_data_pattern") -> None:
        """Visualize missing data patterns in a 2x2 panel.

        Panels: boolean missingness heatmap, per-column missing counts,
        per-column missing percentages, and the 10 most common row-wise
        missingness patterns.
        """
        missing_data = data.isnull()

        if not missing_data.any().any():
            print("No missing data to visualize")
            return

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # 1. Missing data heatmap
        sns.heatmap(missing_data, yticklabels=False, cbar=True, cmap='viridis', ax=axes[0,0])
        axes[0,0].set_title('Missing Data Pattern')

        # 2. Missing data by column
        missing_by_col = missing_data.sum().sort_values(ascending=False)
        missing_by_col = missing_by_col[missing_by_col > 0]

        if not missing_by_col.empty:
            sns.barplot(x=missing_by_col.values, y=missing_by_col.index, ax=axes[0,1])
            axes[0,1].set_title('Missing Data Count by Column')
            axes[0,1].set_xlabel('Missing Count')

        # 3. Missing data percentage
        missing_pct = (missing_by_col / len(data)) * 100
        if not missing_pct.empty:
            sns.barplot(x=missing_pct.values, y=missing_pct.index, ax=axes[1,0])
            axes[1,0].set_title('Missing Data Percentage by Column')
            axes[1,0].set_xlabel('Missing Percentage (%)')

        # 4. Missing data combinations (each unique row pattern counted)
        missing_combinations = missing_data.value_counts().head(10)
        axes[1,1].bar(range(len(missing_combinations)), missing_combinations.values)
        axes[1,1].set_title('Top Missing Data Combinations')
        axes[1,1].set_xlabel('Pattern Index')
        axes[1,1].set_ylabel('Count')

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def plot_outliers_analysis(self, data: pd.DataFrame, method: str = 'iqr',
                               save_name: str = "outliers_analysis") -> None:
        """Analyze and visualize outliers.

        NOTE(review): the ``method`` parameter is currently unused — the
        1.5*IQR rule is always applied regardless of its value.
        """
        numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()

        if not numerical_cols:
            print("No numerical columns for outlier analysis")
            return

        # Grid of up to 3 box plots per row.
        n_cols = min(3, len(numerical_cols))
        n_rows = (len(numerical_cols) + n_cols - 1) // n_cols

        fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
        # Normalize `axes` to a flat indexable sequence regardless of grid shape.
        if len(numerical_cols) == 1:
            axes = [axes]
        elif n_rows == 1:
            axes = [axes] if not isinstance(axes, np.ndarray) else axes
        else:
            axes = axes.flatten()

        outlier_summary = {}

        for i, col in enumerate(numerical_cols):
            ax = axes[i]

            # Box plot
            sns.boxplot(data=data, y=col, ax=ax)
            ax.set_title(f'Outliers in {col}')

            # Calculate outliers using IQR method
            Q1 = data[col].quantile(0.25)
            Q3 = data[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
            outlier_count = len(outliers)
            outlier_pct = (outlier_count / len(data)) * 100

            outlier_summary[col] = {
                'count': outlier_count,
                'percentage': outlier_pct,
                'lower_bound': lower_bound,
                'upper_bound': upper_bound
            }

            # Add outlier info
            ax.text(0.02, 0.98, f'Outliers: {outlier_count}\n({outlier_pct:.1f}%)',
                    transform=ax.transAxes, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='orange', alpha=0.8))

        # Hide empty subplots
        for i in range(len(numerical_cols), len(axes)):
            axes[i].set_visible(False)

        plt.tight_layout()
        self._save_and_show(fig, save_name)

        # Print outlier summary
        print("\nOUTLIER ANALYSIS SUMMARY:")
        for col, info in outlier_summary.items():
            print(f"  {col}: {info['count']} outliers ({info['percentage']:.1f}%)")

    def plot_feature_importance(self, feature_names: List[str], importance_scores: np.ndarray,
                                title: str = "Feature Importance",
                                save_name: str = "feature_importance") -> None:
        """Plot feature importance scores as a horizontal bar chart.

        Args:
            feature_names: Feature labels, aligned with ``importance_scores``.
            importance_scores: One score per feature.
            title: Plot title.
            save_name: Base name for the saved plot.
        """
        # Create DataFrame and sort (ascending so the largest bar is on top)
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance_scores
        }).sort_values('importance', ascending=True)

        # Plot; height scales with the number of features
        fig, ax = plt.subplots(figsize=(10, max(6, len(feature_names) * 0.3)))

        bars = sns.barplot(data=importance_df, x='importance', y='feature', ax=ax)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Importance Score')

        # Add value labels on bars
        for i, bar in enumerate(bars.patches):
            width = bar.get_width()
            ax.text(width, bar.get_y() + bar.get_height()/2,
                    f'{width:.3f}', ha='left', va='center', fontsize=8)

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def _save_and_show(self, fig: plt.Figure, save_name: str) -> None:
        """Save figure and optionally display it, then close it to free memory."""
        # Save figure
        save_path = self.output_dir / f"{save_name}.png"
        fig.savefig(save_path, dpi=self.dpi, bbox_inches='tight', facecolor='white')
        print(f"Plot saved: {save_path}")

        # Try to display (non-blocking; silently degrades on headless backends)
        try:
            plt.show(block=False)
            print("Plot displayed")
        except Exception:
            print("Plot saved (display not available)")

        plt.close(fig)
    def __init__(self, output_dir: str = "plots/model_evaluation",
                 figsize: Tuple[int, int] = (10, 6), dpi: int = 300):
        """
        Initialize model-evaluation visualizer.

        Args:
            output_dir: Directory to save plots (created, with parents, if missing).
            figsize: Default figure size.
            dpi: Plot resolution used when saving figures.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.figsize = figsize
        self.dpi = dpi

    def plot_confusion_matrix(self, y_true: np.ndarray, y_pred: np.ndarray,
                              class_names: Optional[List[str]] = None,
                              save_name: str = "confusion_matrix") -> None:
        """Plot confusion matrix as an annotated heatmap.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted labels.
            class_names: Optional tick labels for both axes.
            save_name: Base name for the saved plot.
        """
        cm = confusion_matrix(y_true, y_pred)

        fig, ax = plt.subplots(figsize=self.figsize)

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                    xticklabels=class_names, yticklabels=class_names)

        ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True Label')

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def plot_roc_curves(self, y_true: np.ndarray, y_proba: np.ndarray,
                        class_names: Optional[List[str]] = None,
                        save_name: str = "roc_curves") -> None:
        """Plot ROC curves for binary or multiclass classification.

        NOTE(review): for the multiclass branch, y_true is assumed to contain
        integer labels 0..n_classes-1 (passed to label_binarize) and y_proba a
        (n_samples, n_classes) probability matrix — confirm against caller.
        """
        from sklearn.preprocessing import label_binarize
        from sklearn.metrics import roc_curve, auc
        from itertools import cycle

        # Infer class count from probability matrix width; 1-D input => binary.
        n_classes = y_proba.shape[1] if len(y_proba.shape) > 1 else 2

        if n_classes == 2:
            # Binary classification: use positive-class column (or the 1-D scores)
            fpr, tpr, _ = roc_curve(y_true, y_proba[:, 1] if len(y_proba.shape) > 1 else y_proba)
            roc_auc = auc(fpr, tpr)

            fig, ax = plt.subplots(figsize=self.figsize)
            ax.plot(fpr, tpr, lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
            ax.plot([0, 1], [0, 1], 'k--', lw=2)
            ax.set_xlim([0.0, 1.0])
            ax.set_ylim([0.0, 1.05])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('ROC Curve')
            ax.legend(loc="lower right")

        else:
            # Multiclass: one-vs-rest curve per class
            y_true_bin = label_binarize(y_true, classes=range(n_classes))

            fig, ax = plt.subplots(figsize=self.figsize)
            colors = cycle(['blue', 'red', 'green', 'orange', 'purple', 'brown'])

            for i, color in zip(range(n_classes), colors):
                fpr, tpr, _ = roc_curve(y_true_bin[:, i], y_proba[:, i])
                roc_auc = auc(fpr, tpr)
                class_name = class_names[i] if class_names else f'Class {i}'

                ax.plot(fpr, tpr, color=color, lw=2,
                        label=f'{class_name} (AUC = {roc_auc:.2f})')

            ax.plot([0, 1], [0, 1], 'k--', lw=2)
            ax.set_xlim([0.0, 1.0])
            ax.set_ylim([0.0, 1.05])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('Multi-class ROC Curves')
            ax.legend(loc="lower right")

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def plot_learning_curves(self, estimator, X: np.ndarray, y: np.ndarray,
                             cv: int = 5, scoring: str = 'accuracy',
                             save_name: str = "learning_curves") -> None:
        """Plot learning curves to diagnose bias/variance.

        Trains ``estimator`` on 10 training-set sizes (10%..100%) with
        ``cv``-fold cross-validation and plots mean ± std bands for the
        training and validation scores.
        """
        train_sizes, train_scores, val_scores = learning_curve(
            estimator, X, y, cv=cv, scoring=scoring, n_jobs=-1,
            train_sizes=np.linspace(0.1, 1.0, 10))

        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        val_mean = np.mean(val_scores, axis=1)
        val_std = np.std(val_scores, axis=1)

        fig, ax = plt.subplots(figsize=self.figsize)

        ax.plot(train_sizes, train_mean, 'o-', color='blue', label='Training score')
        ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                        alpha=0.1, color='blue')

        ax.plot(train_sizes, val_mean, 'o-', color='red', label='Cross-validation score')
        ax.fill_between(train_sizes, val_mean - val_std, val_mean + val_std,
                        alpha=0.1, color='red')

        ax.set_xlabel('Training Set Size')
        ax.set_ylabel(f'{scoring.title()} Score')
        ax.set_title('Learning Curves')
        ax.legend(loc='best')
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def plot_validation_curves(self, estimator, X: np.ndarray, y: np.ndarray,
                               param_name: str, param_range: np.ndarray,
                               cv: int = 5, scoring: str = 'accuracy',
                               save_name: str = "validation_curves") -> None:
        """Plot validation curves for hyperparameter analysis.

        Sweeps ``param_name`` over ``param_range`` with cross-validation and
        plots mean ± std bands for training and validation scores; switches
        to a log x-axis when the range spans >2 orders of magnitude.
        """
        train_scores, val_scores = validation_curve(
            estimator, X, y, param_name=param_name, param_range=param_range,
            cv=cv, scoring=scoring, n_jobs=-1)

        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        val_mean = np.mean(val_scores, axis=1)
        val_std = np.std(val_scores, axis=1)

        fig, ax = plt.subplots(figsize=self.figsize)

        ax.plot(param_range, train_mean, 'o-', color='blue', label='Training score')
        ax.fill_between(param_range, train_mean - train_std, train_mean + train_std,
                        alpha=0.1, color='blue')

        ax.plot(param_range, val_mean, 'o-', color='red', label='Cross-validation score')
        ax.fill_between(param_range, val_mean - val_std, val_mean + val_std,
                        alpha=0.1, color='red')

        ax.set_xlabel(param_name.replace('_', ' ').title())
        ax.set_ylabel(f'{scoring.title()} Score')
        ax.set_title(f'Validation Curves for {param_name}')
        ax.legend(loc='best')
        ax.grid(True, alpha=0.3)

        # Use log scale if parameter values span multiple orders of magnitude
        if param_range.max() / param_range.min() > 100:
            ax.set_xscale('log')

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def plot_residuals(self, y_true: np.ndarray, y_pred: np.ndarray,
                       save_name: str = "residuals") -> None:
        """Plot residual diagnostics for regression in a 2x2 panel.

        Panels: residuals vs predicted, residuals histogram, normal Q-Q
        plot of residuals, and predicted vs actual with identity line.
        """
        residuals = y_true - y_pred

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # 1. Residuals vs Predicted
        axes[0,0].scatter(y_pred, residuals, alpha=0.6)
        axes[0,0].axhline(y=0, color='red', linestyle='--')
        axes[0,0].set_xlabel('Predicted Values')
        axes[0,0].set_ylabel('Residuals')
        axes[0,0].set_title('Residuals vs Predicted')
        axes[0,0].grid(True, alpha=0.3)

        # 2. Residuals histogram
        axes[0,1].hist(residuals, bins=30, alpha=0.7, edgecolor='black')
        axes[0,1].set_xlabel('Residuals')
        axes[0,1].set_ylabel('Frequency')
        axes[0,1].set_title('Residuals Distribution')
        axes[0,1].grid(True, alpha=0.3)

        # 3. Q-Q plot (normality check on residuals)
        from scipy import stats
        stats.probplot(residuals, dist="norm", plot=axes[1,0])
        axes[1,0].set_title('Q-Q Plot')
        axes[1,0].grid(True, alpha=0.3)

        # 4. Predicted vs Actual
        axes[1,1].scatter(y_true, y_pred, alpha=0.6)
        min_val = min(y_true.min(), y_pred.min())
        max_val = max(y_true.max(), y_pred.max())
        axes[1,1].plot([min_val, max_val], [min_val, max_val], 'red', linestyle='--')
        axes[1,1].set_xlabel('True Values')
        axes[1,1].set_ylabel('Predicted Values')
        axes[1,1].set_title('Predicted vs Actual')
        axes[1,1].grid(True, alpha=0.3)

        plt.tight_layout()
        self._save_and_show(fig, save_name)

    def _save_and_show(self, fig: plt.Figure, save_name: str) -> None:
        """Save figure and optionally display it, then close it to free memory."""
        save_path = self.output_dir / f"{save_name}.png"
        fig.savefig(save_path, dpi=self.dpi, bbox_inches='tight', facecolor='white')
        print(f"Model plot saved: {save_path}")

        # Non-blocking display; silently degrades on headless backends
        try:
            plt.show(block=False)
            print("Model plot displayed")
        except Exception:
            print("Model plot saved (display not available)")

        plt.close(fig)