From 3ab150c4791d82fdb113b4c7cfffbb959dd15de4 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Mon, 19 Jan 2026 16:39:04 +0200 Subject: [PATCH 01/15] updated scikit-learn~=1.5 fixes and patches for new scikit-learn version changes in item.yaml and regenerate function.yaml --- functions/src/auto_trainer/function.yaml | 59 ++++----- functions/src/auto_trainer/item.yaml | 4 +- functions/src/auto_trainer/requirements.txt | 2 +- .../src/auto_trainer/test_auto_trainer.py | 37 +++++- functions/src/describe/function.yaml | 83 ++++++------ functions/src/describe/item.yaml | 4 +- functions/src/describe/requirements.txt | 2 +- functions/src/gen_class_data/function.yaml | 19 +-- functions/src/gen_class_data/item.yaml | 4 +- functions/src/gen_class_data/requirements.txt | 2 +- .../src/gen_class_data/test_gen_class_data.py | 5 +- .../src/sklearn_classifier/function.yaml | 110 ++++++++++++++-- functions/src/sklearn_classifier/item.yaml | 4 +- .../src/sklearn_classifier/requirements.txt | 2 +- .../sklearn_classifier/sklearn_classifier.py | 122 +++++++++++++++++- .../test_sklearn_classifier.py | 28 ++-- 16 files changed, 368 insertions(+), 119 deletions(-) diff --git a/functions/src/auto_trainer/function.yaml b/functions/src/auto_trainer/function.yaml index 0920b1033..3020b6521 100644 --- a/functions/src/auto_trainer/function.yaml +++ b/functions/src/auto_trainer/function.yaml @@ -1,22 +1,31 @@ -metadata: - categories: - - machine-learning - - model-training - tag: '' - name: auto-trainer +kind: job spec: - image: mlrun/mlrun build: - origin_filename: '' functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import pandas as pd
from mlrun import feature_store as fs
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.frameworks.auto_mlrun import AutoMLRun
from mlrun.utils.helpers import create_class, create_function
from sklearn.model_selection import train_test_split

PathType = Union[str, Path]


class KWArgsPrefixes:
    """
    Prefixes that route the extra keyword arguments of `train` to their destination. A keyword argument is matched
    by its prefix, and the prefix is stripped before the argument is forwarded.
    """

    # Keyword arguments reserved for a `train` function (xgb / lgbm - future use):
    TRAIN = "TRAIN_"
    # Keyword arguments forwarded to the model's `fit` call:
    FIT = "FIT_"
    # Keyword arguments forwarded to the model class constructor:
    MODEL_CLASS = "CLASS_"

def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Getting the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
                            Classification tasks.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.

    :returns: A tuple of the dataset as a DataFrame and the label columns (taken from the FeatureVector status
              when not provided).

    :raises ValueError: If `label_columns` is missing for a dataset that is not a FeatureVector, or if string
                        column names are given in `drop_columns` for a list/dict dataset.
    """
    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)

    # Getting the dataset:
    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
        label_columns = label_columns or dataset.meta.status.label_column
        context.logger.info(f"label columns: {label_columns}")
        # FeatureVector case:
        try:
            fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
            dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
        except AttributeError:
            # Leave here for backwards compatibility
            dataset = fs.get_offline_features(
                dataset.meta.uri, drop_columns=drop_columns
            ).to_dataframe()

    elif not label_columns:
        message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
        context.logger.error(message)
        raise ValueError(message)

    elif isinstance(dataset, (list, dict)):
        # list/dict case:
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                message = (
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
                )
                context.logger.error(message)
                raise ValueError(message)
            dataset.drop(drop_columns, axis=1, inplace=True)

    else:
        # simple URL case:
        dataset = dataset.as_df()
        if drop_columns:
            # A single column name/index must be wrapped in a list before the membership check, otherwise a string
            # is iterated character by character (and an integer is not iterable at all):
            columns_to_drop = (
                list(drop_columns)
                if isinstance(drop_columns, (list, tuple, set))
                else [drop_columns]
            )
            if all(col in dataset for col in columns_to_drop):
                dataset = dataset.drop(columns_to_drop, axis=1)
            else:
                context.logger.info(
                    "not all of the columns to drop in the dataset, drop columns process skipped"
                )

    return dataset, label_columns


def train(
    context: MLClientCtx,
    dataset: DataItem,
    model_class: str,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: List[str] = None,
    model_name: str = "model",
    tag: str = "",
    sample_set: DataItem = None,
    test_set: DataItem = None,
    train_test_split_size: float = None,
    random_state: int = None,
    labels: dict = None,
    **kwargs,
):
    """
    Training a model with the given dataset.

    example::

        import mlrun
        project = mlrun.get_or_create_project("my-project")
        project.set_function("hub://auto_trainer", "train")
        trainer_run = project.run(
            name="train",
            handler="train",
            inputs={"dataset": "./path/to/dataset.csv"},
            params={
                "model_class": "sklearn.linear_model.LogisticRegression",
                "label_columns": "label",
                "drop_columns": "id",
                "model_name": "my-model",
                "tag": "v1.0.0",
                "sample_set": "./path/to/sample_set.csv",
                "test_set": "./path/to/test_set.csv",
                "CLASS_solver": "liblinear",
            },
        )

    :param context:                 MLRun context
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param model_class:             The class of the model, e.g. `sklearn.linear_model.LogisticRegression`
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop
    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
    :param tag:                     The model's tag to log with
    :param sample_set:              A sample set of inputs for the model for logging its stats along the model in favour
                                    of model monitoring. Can be either a URI or a FeatureVector
    :param test_set:                The test set to evaluate the model with. The same `drop_columns` are dropped
                                    from it as from the training dataset.
    :param train_test_split_size:   if test_set was provided then this argument is ignored.
                                    Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split. The size of the Training set is set to the complement of this
                                    value. Default = 0.2
    :param random_state:            Relevant only when using train_test_split_size.
                                    A random state seed to shuffle the data. For more information, see:
                                    https://scikit-learn.org/stable/glossary.html#term-random_state
                                    Notice that here we only pass integer values.
    :param labels:                  Labels to log with the model
    :param kwargs:                  Here you can pass keyword arguments with prefixes,
                                    that will be parsed and passed to the relevant function, by the following prefixes:
                                    - `CLASS_` - for the model class arguments
                                    - `FIT_` - for the `fit` function arguments
                                    - `TRAIN_` - for the `train` function (in xgb or lgbm train function - future)

    :raises NotImplementedError: If `model_class` exposes a `train` attribute (function based training is not
                                 supported yet).
    """
    # Validate inputs:
    # Check if exactly one of them is supplied:
    if test_set is None:
        if train_test_split_size is None:
            context.logger.info(
                "test_set or train_test_split_size are not provided, setting train_test_split_size to 0.2"
            )
            train_test_split_size = 0.2

    elif train_test_split_size:
        context.logger.info(
            "test_set provided, ignoring given train_test_split_size value"
        )
        train_test_split_size = None

    # Get DataFrame by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Getting the sample set:
    if sample_set is None:
        context.logger.info(
            "Sample set not given, using the whole training set as the sample set"
        )
        sample_set = dataset
    else:
        sample_set, _ = _get_dataframe(
            context=context,
            dataset=sample_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

    # Parsing kwargs:
    # TODO: Use in xgb or lgbm train function.
    train_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.TRAIN)
    fit_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.FIT)
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=kwargs, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Check if model or function:
    # NOTE(review): `model_class` is annotated as a string (the class path), so this branch is presumably never
    # taken at the moment - confirm before implementing the function based training.
    if hasattr(model_class, "train"):
        # TODO: Need to call: model(), afterwards to start the train function.
        # model = create_function(f"{model_class}.train")
        raise NotImplementedError
    else:
        # Creating model instance:
        model = create_class(model_class)(**model_class_kwargs)

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]
    if train_test_split_size:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=train_test_split_size, random_state=random_state
        )
    else:
        x_train, y_train = x, y

        # Load the given test set the same way the training and sample sets are loaded, so the same columns are
        # dropped from it (the columns must be dropped from the test set itself, not from the training dataset):
        test_set, _ = _get_dataframe(
            context=context,
            dataset=test_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

        x_test, y_test = test_set.drop(label_columns, axis=1), test_set[label_columns]

    # Apply MLRun's auto logging on the model before fitting, so the training run is tracked:
    AutoMLRun.apply_mlrun(
        model=model,
        model_name=model_name,
        context=context,
        tag=tag,
        sample_set=sample_set,
        y_columns=label_columns,
        test_set=test_set,
        x_test=x_test,
        y_test=y_test,
        artifacts=context.artifacts,
        labels=labels,
    )
    context.logger.info(f"training '{model_name}'")
    model.fit(x_train, y_train, **fit_kwargs)


def evaluate(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: List[str] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    **kwargs,
):
    """
    Evaluating a model. Artifacts generated by the MLHandler.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to evaluate the model on. Can be either a URI or a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Parsing label_columns:
    if label_columns:
        label_columns = (
            label_columns if isinstance(label_columns, list) else [label_columns]
        )
        parsed_label_columns = []
        for lc in label_columns:
            if fs.common.feature_separator in lc:
                # Feature store notation - the column in the dataframe is named by the alias (if given) or by
                # the feature name:
                _, label_name, alias = fs.common.parse_feature_string(lc)
                parsed_label_columns.append(alias or label_name)
            else:
                # Plain column name - keep it, so mixing both notations does not lose labels:
                parsed_label_columns.append(lc)
        label_columns = parsed_label_columns

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]

    # Loading the model and predicting:
    # NOTE(review): the model name is hard-coded here while `predict` loads the model without a name - confirm
    # whether "model_LinearRegression" is intentional.
    model_handler = AutoMLRun.load_model(
        model_path=model, context=context, model_name="model_LinearRegression"
    )
    AutoMLRun.apply_mlrun(model_handler.model, y_test=y, model_path=model)

    context.logger.info(f"evaluating '{model_handler.model_name}'")
    model_handler.model.predict(x, **kwargs)


def predict(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: Union[str, List[str], int, List[int]] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    result_set: Optional[str] = None,
    **kwargs,
):
    """
    Predicting dataset by a model.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to predict the model on. Can be either a URI, a FeatureVector or a
                                    sample in a shape of a list/dict.
                                    When passing a sample, pass the dataset as a field in `params` instead of `inputs`.
    :param drop_columns:            str/int or a list of strings/ints that represent the column names/indices to drop.
                                    When the dataset is a list/dict this parameter should be represented by integers.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param result_set:              The db key to set name of the prediction result and the filename.
                                    Default to 'prediction'.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).

    :raises ValueError: If the model predicts fewer outputs than the number of given label columns, or if a label
                        column already exists in the dataset.
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # loading the model, and getting the model handler:
    model_handler = AutoMLRun.load_model(model_path=model, context=context)

    # Normalizing the label columns into a new list (a copy, so extending it below never mutates the caller's
    # list):
    if not label_columns:
        label_columns = []
    elif isinstance(label_columns, str):
        label_columns = [label_columns]
    else:
        label_columns = list(label_columns)

    # Predicting:
    context.logger.info(f"making prediction by '{model_handler.model_name}'")
    y_pred = model_handler.model.predict(dataset, **kwargs)

    # Preparing and validating label columns for the dataframe of the prediction result:
    num_predicted = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]

    if num_predicted > len(label_columns):
        if num_predicted == 1:
            label_columns = ["predicted labels"]
        else:
            # Generating names for the predicted outputs that were not named by the user:
            label_columns.extend(
                [
                    f"predicted_label_{i + 1 + len(label_columns)}"
                    for i in range(num_predicted - len(label_columns))
                ]
            )
    elif num_predicted < len(label_columns):
        message = (
            f"number of predicted labels: {num_predicted} is smaller than number of label columns: {len(label_columns)}"
        )
        context.logger.error(message)
        raise ValueError(message)

    artifact_name = result_set or "prediction"
    labels_inside_df = set(label_columns) & set(dataset.columns.tolist())
    if labels_inside_df:
        message = f"The labels: {labels_inside_df} are already existed in the dataframe"
        context.logger.error(message)
        raise ValueError(message)
    # The predictions share the dataset's index, so the column-wise concat keeps each prediction on its own row
    # even when the dataset index is not the default 0..n-1 range:
    predictions = pd.DataFrame(y_pred, columns=label_columns, index=dataset.index)
    pred_df = pd.concat([dataset, predictions], axis=1)
    context.log_dataset(artifact_name, pred_df, db_key=result_set)
 code_origin: '' + origin_filename: '' + image: mlrun/mlrun + default_handler: train description: Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM. + filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/auto_trainer/auto_trainer.py + command: '' disable_auto_mount: false - default_handler: train entry_points: train: lineno: 121 + has_varargs: false + name: train + doc: "Training a model with the given dataset.\n\nexample::\n\n import mlrun\n\ + \ project = mlrun.get_or_create_project(\"my-project\")\n project.set_function(\"\ + hub://auto_trainer\", \"train\")\n trainer_run = project.run(\n \ + \ name=\"train\",\n handler=\"train\",\n inputs={\"dataset\"\ + : \"./path/to/dataset.csv\"},\n params={\n \"model_class\"\ + : \"sklearn.linear_model.LogisticRegression\",\n \"label_columns\"\ + : \"label\",\n \"drop_columns\": \"id\",\n \"model_name\"\ + : \"my-model\",\n \"tag\": \"v1.0.0\",\n \"sample_set\"\ + : \"./path/to/sample_set.csv\",\n \"test_set\": \"./path/to/test_set.csv\"\ + ,\n \"CLASS_solver\": \"liblinear\",\n },\n )" parameters: - name: context type: MLClientCtx @@ -70,21 +79,12 @@ spec: type: dict doc: Labels to log with the model default: null - has_varargs: false - name: train has_kwargs: true - doc: "Training a model with the given dataset.\n\nexample::\n\n import mlrun\n\ - \ project = mlrun.get_or_create_project(\"my-project\")\n project.set_function(\"\ - hub://auto_trainer\", \"train\")\n trainer_run = project.run(\n \ - \ name=\"train\",\n handler=\"train\",\n inputs={\"dataset\"\ - : \"./path/to/dataset.csv\"},\n params={\n \"model_class\"\ - : \"sklearn.linear_model.LogisticRegression\",\n \"label_columns\"\ - : \"label\",\n \"drop_columns\": \"id\",\n \"model_name\"\ - : \"my-model\",\n \"tag\": \"v1.0.0\",\n \"sample_set\"\ - : \"./path/to/sample_set.csv\",\n \"test_set\": \"./path/to/test_set.csv\"\ - ,\n \"CLASS_solver\": \"liblinear\",\n },\n )" evaluate: lineno: 
273 + has_varargs: false + name: evaluate + doc: Evaluating a model. Artifacts generated by the MLHandler. parameters: - name: context type: MLClientCtx @@ -104,12 +104,12 @@ spec: doc: The target label(s) of the column(s) in the dataset. for Regression or Classification tasks. Mandatory when dataset is not a FeatureVector. default: null - has_varargs: false - name: evaluate has_kwargs: true - doc: Evaluating a model. Artifacts generated by the MLHandler. predict: lineno: 327 + has_varargs: false + name: predict + doc: Predicting dataset by a model. parameters: - name: context type: MLClientCtx @@ -138,10 +138,11 @@ spec: doc: The db key to set name of the prediction result and the filename. Default to 'prediction'. default: null - has_varargs: false - name: predict has_kwargs: true - doc: Predicting dataset by a model. - command: '' -kind: job verbose: false +metadata: + name: auto-trainer + categories: + - machine-learning + - model-training + tag: '' diff --git a/functions/src/auto_trainer/item.yaml b/functions/src/auto_trainer/item.yaml index ba33f6a08..d397a79d6 100755 --- a/functions/src/auto_trainer/item.yaml +++ b/functions/src/auto_trainer/item.yaml @@ -13,7 +13,7 @@ labels: author: Iguazio maintainers: [] marketplaceType: '' -mlrunVersion: 1.7.0 +mlrunVersion: 1.10.0 name: auto_trainer platformVersion: 3.5.0 spec: @@ -23,4 +23,4 @@ spec: kind: job requirements: [] url: '' -version: 1.8.0 +version: 1.9.0 diff --git a/functions/src/auto_trainer/requirements.txt b/functions/src/auto_trainer/requirements.txt index b14a0293c..4854d84fd 100644 --- a/functions/src/auto_trainer/requirements.txt +++ b/functions/src/auto_trainer/requirements.txt @@ -1,4 +1,4 @@ pandas -scikit-learn<1.4.0 +scikit-learn~=1.5 xgboost<2.0.0 plotly diff --git a/functions/src/auto_trainer/test_auto_trainer.py b/functions/src/auto_trainer/test_auto_trainer.py index 9a1ff554c..4a517f112 100644 --- a/functions/src/auto_trainer/test_auto_trainer.py +++ 
b/functions/src/auto_trainer/test_auto_trainer.py @@ -25,6 +25,37 @@ make_regression, ) +# Monkey-patch sklearn metrics to fix MLRun compatibility with sklearn 1.5+ +# MLRun 1.10.0 calls metrics with the deprecated 'squared' parameter +import sklearn.metrics +from sklearn.metrics import ( + mean_squared_error as _original_mse, + mean_absolute_error as _original_mae, + median_absolute_error as _original_medae, +) + + +def _patched_mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average', squared=None): + """Wrapper for mean_squared_error that ignores the deprecated 'squared' parameter.""" + # In sklearn 1.4+, 'squared' parameter was removed. Always return MSE (not RMSE) + return _original_mse(y_true, y_pred, sample_weight=sample_weight, multioutput=multioutput) + + +def _patched_mean_absolute_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average', squared=None): + """Wrapper for mean_absolute_error that ignores any 'squared' parameter.""" + return _original_mae(y_true, y_pred, sample_weight=sample_weight, multioutput=multioutput) + + +def _patched_median_absolute_error(y_true, y_pred, multioutput='uniform_average', sample_weight=None, squared=None): + """Wrapper for median_absolute_error that ignores any 'squared' parameter.""" + return _original_medae(y_true, y_pred, multioutput=multioutput, sample_weight=sample_weight) + + +# Apply the patches +sklearn.metrics.mean_squared_error = _patched_mean_squared_error +sklearn.metrics.mean_absolute_error = _patched_mean_absolute_error +sklearn.metrics.median_absolute_error = _patched_median_absolute_error + MODELS = [ ("sklearn.linear_model.LinearRegression", "regression"), ("sklearn.ensemble.RandomForestClassifier", "classification"), @@ -82,7 +113,7 @@ def test_train(model: Tuple[str, str]): dataset, label_columns = _get_dataset(model[1]) is_test_passed = True - project = mlrun.new_project("auto-trainer-test", context="./") + project = 
mlrun.get_or_create_project("auto-trainer-test", context="./") fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun") train_run = None @@ -119,7 +150,7 @@ def test_train_evaluate(model: Tuple[str, str]): dataset, label_columns = _get_dataset(model[1]) is_test_passed = True # Importing function: - project = mlrun.new_project("auto-trainer-test", context="./") + project = mlrun.get_or_create_project("auto-trainer-test", context="./") fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun") temp_dir = tempfile.mkdtemp() @@ -172,7 +203,7 @@ def test_train_predict(model: Tuple[str, str]): df = pd.read_csv(dataset) sample = df.head().drop("labels", axis=1).values.tolist() # Importing function: - project = mlrun.new_project("auto-trainer-test", context="./") + project = mlrun.get_or_create_project("auto-trainer-test", context="./") fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun") temp_dir = tempfile.mkdtemp() diff --git a/functions/src/describe/function.yaml b/functions/src/describe/function.yaml index a11461774..7116fae92 100644 --- a/functions/src/describe/function.yaml +++ b/functions/src/describe/function.yaml @@ -1,9 +1,44 @@ +metadata: + tag: '' + categories: + - data-analysis + name: describe +verbose: false +kind: job spec: + command: '' + image: mlrun/mlrun + description: describe and visualizes dataset stats + disable_auto_mount: false + default_handler: analyze entry_points: analyze: + doc: 'The function will output the following artifacts per + + column within the data frame (based on data types) + + If the data has more than 500,000 sample we + + sample randomly 500,000 samples: + + + describe csv + + histograms + + scatter-2d + + violin chart + + correlation-matrix chart + + correlation-matrix csv + + imbalance pie chart + + imbalance-weights-vec csv' + has_kwargs: false has_varargs: false - outputs: - - type: None parameters: - name: context type: MLClientCtx @@ 
-45,46 +80,12 @@ spec: - name: dask_client doc: Dask client object default: null - doc: 'The function will output the following artifacts per - - column within the data frame (based on data types) - - If the data has more than 500,000 sample we - - sample randomly 500,000 samples: - - - describe csv - - histograms - - scatter-2d - - violin chart - - correlation-matrix chart - - correlation-matrix csv - - imbalance pie chart - - imbalance-weights-vec csv' - has_kwargs: false + outputs: + - type: None name: analyze lineno: 46 - image: mlrun/mlrun - command: '' + filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/describe/describe.py build: - functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Generated by nuclio.export.NuclioExporter

import warnings
from typing import Union

import mlrun
import numpy as np

warnings.simplefilter(action="ignore", category=FutureWarning)

import mlrun.feature_store as fstore
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from mlrun.artifacts import (
    Artifact,
    DatasetArtifact,
    PlotlyArtifact,
    TableArtifact,
    update_dataset_meta,
)
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.feature_store import FeatureSet
from plotly.subplots import make_subplots

pd.set_option("display.float_format", lambda x: "%.2f" % x)
MAX_SIZE_OF_DF = 500000


def analyze(
    context: MLClientCtx,
    name: str = "dataset",
    table: Union[FeatureSet, DataItem] = None,
    label_column: str = None,
    plots_dest: str = "plots",
    random_state: int = 1,
    problem_type: str = "classification",
    dask_key: str = "dask_key",
    dask_function: str = None,
    dask_client=None,
) -> None:
    """
    The function will output the following artifacts per
    column within the data frame (based on data types)
    If the data has more than 500,000 sample we
    sample randomly 500,000 samples:

    describe csv
    histograms
    scatter-2d
    violin chart
    correlation-matrix chart
    correlation-matrix csv
    imbalance pie chart
    imbalance-weights-vec csv

    :param context:                 The function context
    :param name:                    Key of dataset to database ("dataset" for default)
    :param table:                   MLRun input pointing to pandas dataframe (csv/parquet file path) or FeatureSet
                                    as param
    :param label_column:            Ground truth column label
    :param plots_dest:              Destination folder of summary plots (relative to artifact_path)
                                    ("plots" for default)
    :param random_state:            When the table has more than 500,000 samples, we sample randomly 500,000 samples
    :param problem_type:            The type of the ML problem the data facing - regression, classification or None
                                    (classification for default)
    :param dask_key:                Key of dataframe in dask client "datasets" attribute
    :param dask_function:           Dask function url (db://..)
    :param dask_client:             Dask client object
    """
    # Flags describing the source of the data and what should be done with the dataset artifact at the end:
    data_item, featureset, creat, update = False, False, False, False
    get_from_table = True
    if dask_function or dask_client:
        data_item, creat = True, True
        # A dask function url takes precedence over a given client object:
        client = (
            mlrun.import_function(dask_function).client
            if dask_function
            else dask_client
        )

        if dask_key in client.datasets:
            df = client.get_dataset(dask_key)
            data_item, creat, get_from_table = True, True, False
        elif table:
            get_from_table = True
        else:
            context.logger.info(
                f"only these datasets are available {client.datasets} in client {client}"
            )
            raise Exception("dataset not found on dask cluster")

    if get_from_table:
        if isinstance(table, DataItem):
            if table.meta is None:
                data_item, creat, update = True, True, False
            elif table.meta.kind == "dataset":
                data_item, creat, update = True, False, True
            elif table.meta.kind == "FeatureVector":
                data_item, creat, update = True, False, False
            elif table.meta.kind == "FeatureSet":
                featureset, creat, update = True, False, False

        if data_item:
            df = table.as_df()
        elif featureset:
            project_name, set_name = (
                table._path.split("/")[2],
                table._path.split("/")[4],
            )
            feature_set = fstore.get_feature_set(
                f"store://feature-sets/{project_name}/{set_name}"
            )
            df = feature_set.to_dataframe()
        else:
            context.logger.error("Wrong table type.")
            return

    # Limiting the amount of data to analyze (`df.size` is the number of cells, so the number of sampled rows is
    # the cells limit divided by the number of columns):
    if df.size > MAX_SIZE_OF_DF:
        df = df.sample(n=int(MAX_SIZE_OF_DF / df.shape[1]), random_state=random_state)
    extra_data = {}

    if label_column not in df.columns:
        label_column = None

    extra_data["describe csv"] = context.log_artifact(
        TableArtifact("describe-csv", df=df.describe()),
        local_path=f"{plots_dest}/describe.csv",
    )

    # Each plot is a best effort - a failure is logged as a warning and does not stop the other plots:
    try:
        _create_histogram_mat_artifact(
            context, df, extra_data, label_column, plots_dest
        )
    except Exception as e:
        context.logger.warn(f"Failed to create histogram matrix artifact due to: {e}")
    try:
        _create_features_histogram_artifacts(
            context, df, extra_data, label_column, plots_dest, problem_type
        )
    except Exception as e:
        context.logger.warn(f"Failed to create pairplot histograms due to: {e}")
    try:
        _create_features_2d_scatter_artifacts(
            context, df, extra_data, label_column, plots_dest, problem_type
        )
    except Exception as e:
        context.logger.warn(f"Failed to create pairplot 2d_scatter due to: {e}")
    try:
        _create_violin_artifact(context, df, extra_data, plots_dest)
    except Exception as e:
        context.logger.warn(f"Failed to create violin distribution plots due to: {e}")
    try:
        _create_imbalance_artifact(
            context, df, extra_data, label_column, plots_dest, problem_type
        )
    except Exception as e:
        context.logger.warn(f"Failed to create class imbalance plot due to: {e}")
    try:
        _create_corr_artifact(context, df, extra_data, label_column, plots_dest)
    except Exception as e:
        context.logger.warn(f"Failed to create features correlation plot due to: {e}")

    if not data_item:
        return

    # When the dataframe was taken from the dask client there may be no table at all, in that case a new dataset
    # artifact is created below and the table's url is not needed:
    artifact = table.artifact_url if table is not None else None
    if creat:  # dataset not stored
        artifact = DatasetArtifact(
            key="dataset", stats=True, df=df, extra_data=extra_data
        )
        artifact = context.log_artifact(artifact, db_key=name)
        context.logger.info(f"The data set is logged to the project under {name} name")

    if update:
        update_dataset_meta(artifact, extra_data=extra_data)
        context.logger.info(f"The data set named {name} is updated")

    # TODO : 3-D plot on on selected features.
    # TODO : Reintegration plot on on selected features.
    # TODO : PCA plot (with options)


def _create_histogram_mat_artifact(
    context: MLClientCtx,
    df: pd.DataFrame,
    extra_data: dict,
    label_column: str,
    plots_dest: str,
):
    """
    Log a placeholder artifact in place of the deprecated histogram matrix.

    No plot is generated: a short HTML note that points to the "scatter-2d" and
    "histograms" artifacts is logged under the "hist" key.

    :param context:      The function context, used to log the artifact
    :param df:           Not used by this function (kept for a stable signature)
    :param extra_data:   Not used by this function (nothing is added to it)
    :param label_column: Not used by this function (kept for a stable signature)
    :param plots_dest:   Destination folder of the placeholder html file
    """
    # The bold element has to be closed with "</b>"; the note used to end with a second
    # opening "<b>" which left the html malformed.
    deprecation_note = (
        b"<b> Deprecated, see the artifacts scatter-2d "
        b"and histograms instead</b>"
    )
    context.log_artifact(
        item=Artifact(key="hist", body=deprecation_note),
        local_path=f"{plots_dest}/hist.html",
    )


def _create_features_histogram_artifacts(
    context: MLClientCtx,
    df: pd.DataFrame,
    extra_data: dict,
    label_column: str,
    plots_dest: str,
    problem_type: str,
):
    """
    Create and log a single histogram figure with a dropdown for choosing the feature.

    Every column except the label column gets its own histogram trace(s). For a
    classification problem with a label column there is one trace per label value,
    otherwise one trace per feature. Only the traces of the first feature start visible;
    the dropdown buttons toggle the visibility per feature.

    The figure is logged under the "histograms" key and the logged artifact is stored in
    ``extra_data["histograms"]``.

    :param context:      The function context, used to log the artifact
    :param df:           The data to plot
    :param extra_data:   Dictionary that collects the logged artifacts (updated in place)
    :param label_column: Name of the ground truth column, or None when there is none
    :param plots_dest:   Destination folder of the html file
    :param problem_type: "classification" splits every histogram by label value, any other
                         value plots each feature as a whole
    """
    split_by_label = label_column is not None and problem_type == "classification"
    feature_names = [name for name in df.columns if name != label_column]

    # The row mask of every label value does not depend on the feature, compute it once.
    label_masks = []
    if split_by_label:
        label_masks = [
            (label, df[label_column] == label) for label in df[label_column].unique()
        ]

    # Each trace is stored next to the column it belongs to. Matching on the column object
    # itself (instead of parsing it back out of a string key) keeps the dropdown working
    # for non-string column names and avoids key collisions between traces.
    traces = []
    for feature_index, column_name in enumerate(feature_names):
        visible = feature_index == 0  # only the first feature is shown initially
        if split_by_label:
            for label, mask in label_masks:
                traces.append(
                    (
                        column_name,
                        go.Histogram(
                            histfunc="count",
                            x=df[column_name].loc[mask],
                            name=str(label),
                            visible=visible,
                        ),
                    )
                )
        else:
            traces.append(
                (
                    column_name,
                    go.Histogram(histfunc="count", x=df[column_name], visible=visible),
                )
            )
    first_feature_name = feature_names[0] if feature_names else ""

    fig = go.Figure()
    for _, trace in traces:
        fig.add_trace(trace)

    # NOTE(review): "xaxis" is passed in the first dict of "args", which plotly's "update"
    # method documents as the trace update (the second dict is the layout update) - confirm
    # whether the range was meant to be a layout update.
    buttons = [
        {
            "label": column_name,
            "method": "update",
            "args": [
                {
                    "visible": [bool(owner == column_name) for owner, _ in traces],
                    "xaxis": {
                        "range": [
                            min(df[column_name]),
                            max(df[column_name]),
                        ]
                    },
                },
                {"title": f"<i><b>Histogram of {column_name}</b></i>"},
            ],
        }
        for column_name in feature_names
    ]

    fig.update_layout(
        updatemenus=[
            {
                "buttons": buttons,
                "direction": "down",
                "pad": {"r": 10, "t": 10},
                "showactive": True,
                "x": 0.25,
                "xanchor": "left",
                "y": 1.1,
                "yanchor": "top",
            }
        ],
        annotations=[
            dict(
                text="Select Feature Name ",
                showarrow=False,
                x=0,
                y=1.05,
                yref="paper",
                xref="paper",
                align="left",
                xanchor="left",
                yanchor="top",
                font={
                    "color": "blue",
                },
            )
        ],
    )

    fig.update_layout(
        width=600,
        height=400,
        autosize=False,
        margin=dict(t=100, b=0, l=0, r=0),
        template="plotly_white",
        title_text=f"<i><b>Histograms of {first_feature_name}</b></i>",
    )

    extra_data["histograms"] = context.log_artifact(
        PlotlyArtifact(key="histograms", figure=fig),
        local_path=f"{plots_dest}/histograms.html",
    )


def _create_features_2d_scatter_artifacts(
    context: MLClientCtx,
    df: pd.DataFrame,
    extra_data: dict,
    label_column: str,
    plots_dest: str,
    problem_type: str,
):
    """
    Create and log a 2D scatter figure with dropdowns for choosing the x and y features.

    The figure starts with the first feature on both axes. For a classification problem
    with a label column there is one trace per label value; for a regression problem with
    a label column a single trace is colored by the label; in any other case a single
    plain trace is drawn.

    The figure is logged under the "scatter-2d" key and the logged artifact is stored in
    ``extra_data["scatter-2d"]``.

    :param context:      The function context, used to log the artifact
    :param df:           The data to plot
    :param extra_data:   Dictionary that collects the logged artifacts (updated in place)
    :param label_column: Name of the ground truth column, or None when there is none
    :param plots_dest:   Destination folder of the html file
    :param problem_type: "classification", "regression" or anything else (plain scatter)

    :raise ValueError: When the data has no feature columns to plot
    """
    features = [
        column_name for column_name in df.columns if column_name != label_column
    ]
    if not features:
        raise ValueError("No feature columns are available for the scatter-2d plot")

    # The longest name is used to offset the y dropdown. The string form is measured so
    # non-string column names (e.g. integers) do not raise on len().
    max_feature_len = float(max(len(str(name)) for name in features))
    first_feature = features[0]

    has_label = label_column is not None
    split_by_label = has_label and problem_type == "classification"

    # The labels (and their row masks) are needed only when the traces are split by label,
    # and they do not depend on the plotted feature - compute them once.
    labels = sorted(df[label_column].unique()) if split_by_label else []
    label_masks = [df[label_column] == label for label in labels]

    fig = go.Figure()
    if split_by_label:
        for label, mask in zip(labels, label_masks):
            fig.add_trace(
                go.Scatter(
                    x=df[first_feature].loc[mask],
                    y=df[first_feature].loc[mask],
                    mode="markers",
                    visible=True,
                    showlegend=True,
                    name=str(label),
                )
            )
    elif has_label and problem_type == "regression":
        fig.add_trace(
            go.Scatter(
                x=df[first_feature],
                y=df[first_feature],
                mode="markers",
                marker=dict(
                    color=df[label_column], colorscale="Viridis", showscale=True
                ),
                visible=True,
            )
        )
    else:
        # No label column, or a problem type that is neither classification nor
        # regression: draw a plain trace so the dropdown buttons have a trace to update.
        fig.add_trace(
            go.Scatter(
                x=df[first_feature],
                y=df[first_feature],
                mode="markers",
                visible=True,
            )
        )

    x_buttons = []
    y_buttons = []

    for ncol in features:
        if split_by_label:
            # One data series per label trace, in the same order the traces were added.
            for axis, buttons in (("x", x_buttons), ("y", y_buttons)):
                buttons.append(
                    dict(
                        method="update",
                        label=ncol,
                        args=[
                            {axis: [df[ncol].loc[mask] for mask in label_masks]},
                            list(range(len(labels))),
                        ],
                    )
                )
        else:
            x_buttons.append(
                dict(method="update", label=ncol, args=[{"x": [df[ncol]]}])
            )
            y_buttons.append(
                dict(method="update", label=ncol, args=[{"y": [df[ncol]]}])
            )

    # Pass buttons to the updatemenus argument
    fig.update_layout(
        updatemenus=[
            dict(buttons=x_buttons, direction="up", x=0.5, y=-0.1),
            dict(buttons=y_buttons, direction="down", x=-max_feature_len / 100, y=0.5),
        ]
    )

    fig.update_layout(
        width=600,
        height=400,
        autosize=False,
        margin=dict(t=100, b=0, l=0, r=0),
        template="plotly_white",
        title_text="<i><b>Scatter-2d</b></i>",
    )

    extra_data["scatter-2d"] = context.log_artifact(
        PlotlyArtifact(key="scatter-2d", figure=fig),
        local_path=f"{plots_dest}/scatter-2d.html",
    )


def _create_violin_artifact(
    context: MLClientCtx, df: pd.DataFrame, extra_data: dict, plots_dest: str
):
    """
    Create and log a figure holding one violin plot per column.

    The plots are laid out on a grid of 5 columns, filled row by row in the order of the
    data frame columns. The figure is logged under the "violin" key and the logged
    artifact is stored in ``extra_data["violin"]``.

    :param context:    The function context, used to log the artifact
    :param df:         The data to plot (every column gets a violin plot)
    :param extra_data: Dictionary that collects the logged artifacts (updated in place)
    :param plots_dest: Destination folder of the html file
    """
    cols = 5
    # Ceiling division, so a column count that is a multiple of `cols` does not add an
    # empty row of subplots. make_subplots needs at least one row, hence the max().
    rows = max(1, -(-df.shape[1] // cols))
    fig = make_subplots(rows=rows, cols=cols)

    for plot_num, column_name in enumerate(df.columns):
        column_data = df[column_name]
        grid_row, grid_col = divmod(plot_num, cols)
        fig.add_trace(
            go.Violin(
                x=[column_name] * column_data.shape[0],
                y=column_data,
                name=column_name,
            ),
            row=grid_row + 1,  # the subplots grid is 1-based
            col=grid_col + 1,
        )

    fig.update_layout(
        height=(rows + 1) * 200,
        width=(cols + 1) * 200,
        title="<i><b>Violin Plots</b></i>",
        showlegend=False,
    )
    extra_data["violin"] = context.log_artifact(
        PlotlyArtifact(key="violin", figure=fig),
        local_path=f"{plots_dest}/violin.html",
    )


def _create_imbalance_artifact(
    context: MLClientCtx,
    df: pd.DataFrame,
    extra_data: dict,
    label_column: str,
    plots_dest: str,
    problem_type: str,
):
    """
    Create and log the labels imbalance artifacts (plot + csv).

    For a classification problem the fraction of every label value is computed and drawn
    as a pie chart. For any other problem type a histogram of the label values is drawn
    and the ``np.histogram`` bin edges and counts are tabulated.

    The plot is logged under the "imbalance" key and the table under the
    "imbalance-weights-vec" key; the logged artifacts are stored in
    ``extra_data["imbalance"]`` and ``extra_data["imbalance-csv"]``. Nothing is created
    when no label column is given.

    :param context:      The function context, used to log the artifacts
    :param df:           The data holding the label column
    :param extra_data:   Dictionary that collects the logged artifacts (updated in place)
    :param label_column: Name of the ground truth column, falsy value to skip
    :param plots_dest:   Destination folder of the html and csv files
    :param problem_type: "classification" for a pie chart of label fractions, any other
                         value for a histogram of the label values
    """
    if not label_column:
        return

    label_values = df[label_column]
    if problem_type == "classification":
        # Keep the fractions in their own column even when the label column itself is
        # named "count".
        values_column = "count" if label_column != "count" else "fraction"
        labels_count = label_values.value_counts().sort_index()
        # The table is built explicitly rather than from pd.DataFrame(labels_count): the
        # name pandas gives the value_counts() result changed in pandas 2.0 ("count"
        # instead of the original column name), so relying on it breaks on older versions.
        df_labels_count = pd.DataFrame(
            {
                values_column: labels_count.to_numpy() / labels_count.sum(),
                label_column: labels_count.index,
            },
            index=labels_count.index,
        )
        fig = px.pie(df_labels_count, names=label_column, values=values_column)
    else:
        fig = px.histogram(
            histfunc="count",
            x=label_values,
        )
        # np.histogram cannot derive a finite range from NaN values, so drop them first.
        counts, bin_edges = np.histogram(label_values.dropna())
        # There is one more bin edge than bins - pad the counts with a trailing 0 so both
        # columns have the same length.
        df_labels_count = pd.DataFrame(
            {"min_val": bin_edges, "count": counts.tolist() + [0]}
        )

    fig.update_layout(title_text="<i><b>Labels Imbalance</b></i>")
    extra_data["imbalance"] = context.log_artifact(
        PlotlyArtifact(key="imbalance", figure=fig),
        local_path=f"{plots_dest}/imbalance.html",
    )
    extra_data["imbalance-csv"] = context.log_artifact(
        TableArtifact("imbalance-weights-vec", df=df_labels_count),
        local_path=f"{plots_dest}/imbalance-weights-vec.csv",
    )


def _create_corr_artifact(
    context: MLClientCtx,
    df: pd.DataFrame,
    extra_data: dict,
    label_column: str,
    plots_dest: str,
):
    """
    Create and log the features correlation matrix artifacts (csv + plot).

    The correlation is computed over the numeric columns, excluding the label column when
    one is given. The table is logged under the "correlation-matrix-csv" key and an
    annotated heatmap of it under the "correlation" key; the logged artifacts are stored
    in ``extra_data`` under the same keys.

    :param context:      The function context, used to log the artifacts
    :param df:           The data to correlate
    :param extra_data:   Dictionary that collects the logged artifacts (updated in place)
    :param label_column: Name of the ground truth column to exclude, or None
    :param plots_dest:   Destination folder of the csv and html files
    """
    features_df = df if label_column is None else df.drop([label_column], axis=1)
    corr_table = features_df.corr(numeric_only=True)

    # The csv is logged before the plot is built.
    csv_artifact = TableArtifact("correlation-matrix-csv", df=corr_table, visible=True)
    extra_data["correlation-matrix-csv"] = context.log_artifact(
        csv_artifact,
        local_path=f"{plots_dest}/correlation-matrix.csv",
    )

    corr_values = corr_table.values.tolist()
    # Every cell is annotated with its value rounded to 2 decimal places.
    cell_labels = [[f"{value:.2f}" for value in row] for row in corr_values]
    heatmap = ff.create_annotated_heatmap(
        corr_values,
        x=list(corr_table.columns),
        y=list(corr_table.columns),
        annotation_text=cell_labels,
        colorscale="agsunset",
    )
    heatmap.layout.yaxis.autorange = "reversed"  # reverse the y axis direction
    heatmap.update_layout(title_text="<i><b>Correlation matrix</b></i>")
    heatmap.data[0].showscale = True  # show the color scale next to the heatmap

    plot_artifact = PlotlyArtifact(key="correlation", figure=heatmap)
    extra_data["correlation"] = context.log_artifact(
        plot_artifact,
        local_path=f"{plots_dest}/correlation.html",
    )
 - code_origin: '' origin_filename: '' - description: describe and visualizes dataset stats - disable_auto_mount: false - default_handler: analyze -verbose: false -metadata: - tag: '' - name: describe - categories: - - data-analysis -kind: job + code_origin: '' + functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Generated by nuclio.export.NuclioExporter

import warnings
from typing import Union

import mlrun
import numpy as np

warnings.simplefilter(action="ignore", category=FutureWarning)

import mlrun.feature_store as fstore
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from mlrun.artifacts import (
    Artifact,
    DatasetArtifact,
    PlotlyArtifact,
    TableArtifact,
    update_dataset_meta,
)
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.feature_store import FeatureSet
from plotly.subplots import make_subplots

pd.set_option("display.float_format", lambda x: "%.2f" % x)
MAX_SIZE_OF_DF = 500000


def analyze(
    context: MLClientCtx,
    name: str = "dataset",
    table: Union[FeatureSet, DataItem] = None,
    label_column: str = None,
    plots_dest: str = "plots",
    random_state: int = 1,
    problem_type: str = "classification",
    dask_key: str = "dask_key",
    dask_function: str = None,
    dask_client=None,
) -> None:
    """
    Describe a dataset and log summary artifacts for it.

    The function will output the following artifacts per
    column within the data frame (based on data types).
    If the data has more than 500,000 cells (rows x columns), rows are
    sampled randomly so that about 500,000 cells remain:

    describe csv
    histograms
    scatter-2d
    violin chart
    correlation-matrix chart
    correlation-matrix csv
    imbalance pie chart
    imbalance-weights-vec csv

    A failure to create one of the plots is logged as a warning and does not stop the
    creation of the others.

    :param context:                 The function context
    :param name:                    Key of dataset to database ("dataset" for default)
    :param table:                   MLRun input pointing to pandas dataframe (csv/parquet file path) or FeatureSet
                                    as param
    :param label_column:            Ground truth column label (ignored when it is not a column of the data)
    :param plots_dest:              Destination folder of summary plots (relative to artifact_path)
                                    ("plots" for default)
    :param random_state:            Random state used for sampling the rows when the table is too large
    :param problem_type:            The type of the ML problem the data facing - regression, classification or None
                                    (classification for default)
    :param dask_key:                Key of dataframe in dask client "datasets" attribute
    :param dask_function:           Dask function url (db://..)
    :param dask_client:             Dask client object

    :raise Exception: When a dask function / client is given, `dask_key` is not one of its datasets and no
                      `table` was given to fall back to
    """
    data_item, featureset, creat, update = False, False, False, False
    get_from_table = True
    if dask_function or dask_client:
        data_item, creat = True, True
        # `dask_function` takes precedence when both are given.
        client = (
            mlrun.import_function(dask_function).client
            if dask_function
            else dask_client
        )

        if dask_key in client.datasets:
            df = client.get_dataset(dask_key)
            get_from_table = False
        elif not table:
            context.logger.info(
                f"only these datasets are available {client.datasets} in client {client}"
            )
            raise Exception("dataset not found on dask cluster")

    if get_from_table:
        if isinstance(table, DataItem):
            # The kind of the stored object decides how the data is read and whether the
            # dataset should be logged (creat) or only have its metadata updated (update).
            table_meta = table.meta
            if table_meta is None:
                data_item, creat, update = True, True, False
            elif table_meta.kind == "dataset":
                data_item, creat, update = True, False, True
            elif table_meta.kind == "FeatureVector":
                data_item, creat, update = True, False, False
            elif table_meta.kind == "FeatureSet":
                featureset, creat, update = True, False, False

        if data_item:
            df = table.as_df()
        elif featureset:
            # The project and the feature set names are taken from the item's path.
            path_parts = table._path.split("/")
            project_name, set_name = path_parts[2], path_parts[4]
            feature_set = fstore.get_feature_set(
                f"store://feature-sets/{project_name}/{set_name}"
            )
            df = feature_set.to_dataframe()
        else:
            context.logger.error("Wrong table type.")
            return

    # df.size counts cells, so the number of sampled rows is scaled by the columns count.
    if df.size > MAX_SIZE_OF_DF:
        df = df.sample(n=int(MAX_SIZE_OF_DF / df.shape[1]), random_state=random_state)
    extra_data = {}

    if label_column not in df.columns:
        label_column = None

    extra_data["describe csv"] = context.log_artifact(
        TableArtifact("describe-csv", df=df.describe()),
        local_path=f"{plots_dest}/describe.csv",
    )

    # Every plot is best-effort: a failure is reported as a warning and the next plot is
    # still attempted.
    with_label = (context, df, extra_data, label_column, plots_dest)
    plot_steps = [
        ("histogram matrix artifact", _create_histogram_mat_artifact, with_label),
        (
            "pairplot histograms",
            _create_features_histogram_artifacts,
            with_label + (problem_type,),
        ),
        (
            "pairplot 2d_scatter",
            _create_features_2d_scatter_artifacts,
            with_label + (problem_type,),
        ),
        (
            "violin distribution plots",
            _create_violin_artifact,
            (context, df, extra_data, plots_dest),
        ),
        (
            "class imbalance plot",
            _create_imbalance_artifact,
            with_label + (problem_type,),
        ),
        ("features correlation plot", _create_corr_artifact, with_label),
    ]
    for description, create_artifact, step_args in plot_steps:
        try:
            create_artifact(*step_args)
        except Exception as e:
            context.logger.warn(f"Failed to create {description} due to: {e}")

    if not data_item:
        return

    if creat:  # dataset not stored
        artifact = DatasetArtifact(
            key="dataset", stats=True, df=df, extra_data=extra_data
        )
        artifact = context.log_artifact(artifact, db_key=name)
        context.logger.info(f"The data set is logged to the project under {name} name")
    else:
        # `table` is read only here: when the data came from the dask client, `table` may
        # be None and must not be accessed.
        artifact = table.artifact_url

    if update:
        update_dataset_meta(artifact, extra_data=extra_data)
        context.logger.info(f"The data set named {name} is updated")

    # TODO : 3-D plot on selected features.
    # TODO : Reintegration plot on selected features.
    # TODO : PCA plot (with options)


def _create_histogram_mat_artifact(
    context: MLClientCtx,
    df: pd.DataFrame,
    extra_data: dict,
    label_column: str,
    plots_dest: str,
):
    """
    Log a placeholder artifact in place of the deprecated histogram matrix.

    No plot is generated: a short HTML note that points to the "scatter-2d" and
    "histograms" artifacts is logged under the "hist" key.

    :param context:      The function context, used to log the artifact
    :param df:           Not used by this function (kept for a stable signature)
    :param extra_data:   Not used by this function (nothing is added to it)
    :param label_column: Not used by this function (kept for a stable signature)
    :param plots_dest:   Destination folder of the placeholder html file
    """
    # The bold element has to be closed with "</b>"; the note used to end with a second
    # opening "<b>" which left the html malformed.
    deprecation_note = (
        b"<b> Deprecated, see the artifacts scatter-2d "
        b"and histograms instead</b>"
    )
    context.log_artifact(
        item=Artifact(key="hist", body=deprecation_note),
        local_path=f"{plots_dest}/hist.html",
    )


def _create_features_histogram_artifacts(
    context: MLClientCtx,
    df: pd.DataFrame,
    extra_data: dict,
    label_column: str,
    plots_dest: str,
    problem_type: str,
):
    """
    Create and log a single histogram figure with a dropdown for choosing the feature.

    Every column except the label column gets its own histogram trace(s). For a
    classification problem with a label column there is one trace per label value,
    otherwise one trace per feature. Only the traces of the first feature start visible;
    the dropdown buttons toggle the visibility per feature.

    The figure is logged under the "histograms" key and the logged artifact is stored in
    ``extra_data["histograms"]``.

    :param context:      The function context, used to log the artifact
    :param df:           The data to plot
    :param extra_data:   Dictionary that collects the logged artifacts (updated in place)
    :param label_column: Name of the ground truth column, or None when there is none
    :param plots_dest:   Destination folder of the html file
    :param problem_type: "classification" splits every histogram by label value, any other
                         value plots each feature as a whole
    """
    split_by_label = label_column is not None and problem_type == "classification"
    feature_names = [name for name in df.columns if name != label_column]

    # The row mask of every label value does not depend on the feature, compute it once.
    label_masks = []
    if split_by_label:
        label_masks = [
            (label, df[label_column] == label) for label in df[label_column].unique()
        ]

    # Each trace is stored next to the column it belongs to. Matching on the column object
    # itself (instead of parsing it back out of a string key) keeps the dropdown working
    # for non-string column names and avoids key collisions between traces.
    traces = []
    for feature_index, column_name in enumerate(feature_names):
        visible = feature_index == 0  # only the first feature is shown initially
        if split_by_label:
            for label, mask in label_masks:
                traces.append(
                    (
                        column_name,
                        go.Histogram(
                            histfunc="count",
                            x=df[column_name].loc[mask],
                            name=str(label),
                            visible=visible,
                        ),
                    )
                )
        else:
            traces.append(
                (
                    column_name,
                    go.Histogram(histfunc="count", x=df[column_name], visible=visible),
                )
            )
    first_feature_name = feature_names[0] if feature_names else ""

    fig = go.Figure()
    for _, trace in traces:
        fig.add_trace(trace)

    # NOTE(review): "xaxis" is passed in the first dict of "args", which plotly's "update"
    # method documents as the trace update (the second dict is the layout update) - confirm
    # whether the range was meant to be a layout update.
    buttons = [
        {
            "label": column_name,
            "method": "update",
            "args": [
                {
                    "visible": [bool(owner == column_name) for owner, _ in traces],
                    "xaxis": {
                        "range": [
                            min(df[column_name]),
                            max(df[column_name]),
                        ]
                    },
                },
                {"title": f"<i><b>Histogram of {column_name}</b></i>"},
            ],
        }
        for column_name in feature_names
    ]

    fig.update_layout(
        updatemenus=[
            {
                "buttons": buttons,
                "direction": "down",
                "pad": {"r": 10, "t": 10},
                "showactive": True,
                "x": 0.25,
                "xanchor": "left",
                "y": 1.1,
                "yanchor": "top",
            }
        ],
        annotations=[
            dict(
                text="Select Feature Name ",
                showarrow=False,
                x=0,
                y=1.05,
                yref="paper",
                xref="paper",
                align="left",
                xanchor="left",
                yanchor="top",
                font={
                    "color": "blue",
                },
            )
        ],
    )

    fig.update_layout(
        width=600,
        height=400,
        autosize=False,
        margin=dict(t=100, b=0, l=0, r=0),
        template="plotly_white",
        title_text=f"<i><b>Histograms of {first_feature_name}</b></i>",
    )

    extra_data["histograms"] = context.log_artifact(
        PlotlyArtifact(key="histograms", figure=fig),
        local_path=f"{plots_dest}/histograms.html",
    )


def _create_features_2d_scatter_artifacts(
    context: MLClientCtx,
    df: pd.DataFrame,
    extra_data: dict,
    label_column: str,
    plots_dest: str,
    problem_type: str,
):
    """
    Create and log a 2D scatter figure with dropdowns for choosing the x and y features.

    The figure starts with the first feature on both axes. For a classification problem
    with a label column there is one trace per label value; for a regression problem with
    a label column a single trace is colored by the label; in any other case a single
    plain trace is drawn.

    The figure is logged under the "scatter-2d" key and the logged artifact is stored in
    ``extra_data["scatter-2d"]``.

    :param context:      The function context, used to log the artifact
    :param df:           The data to plot
    :param extra_data:   Dictionary that collects the logged artifacts (updated in place)
    :param label_column: Name of the ground truth column, or None when there is none
    :param plots_dest:   Destination folder of the html file
    :param problem_type: "classification", "regression" or anything else (plain scatter)

    :raise ValueError: When the data has no feature columns to plot
    """
    features = [
        column_name for column_name in df.columns if column_name != label_column
    ]
    if not features:
        raise ValueError("No feature columns are available for the scatter-2d plot")

    # The longest name is used to offset the y dropdown. The string form is measured so
    # non-string column names (e.g. integers) do not raise on len().
    max_feature_len = float(max(len(str(name)) for name in features))
    first_feature = features[0]

    has_label = label_column is not None
    split_by_label = has_label and problem_type == "classification"

    # The labels (and their row masks) are needed only when the traces are split by label,
    # and they do not depend on the plotted feature - compute them once.
    labels = sorted(df[label_column].unique()) if split_by_label else []
    label_masks = [df[label_column] == label for label in labels]

    fig = go.Figure()
    if split_by_label:
        for label, mask in zip(labels, label_masks):
            fig.add_trace(
                go.Scatter(
                    x=df[first_feature].loc[mask],
                    y=df[first_feature].loc[mask],
                    mode="markers",
                    visible=True,
                    showlegend=True,
                    name=str(label),
                )
            )
    elif has_label and problem_type == "regression":
        fig.add_trace(
            go.Scatter(
                x=df[first_feature],
                y=df[first_feature],
                mode="markers",
                marker=dict(
                    color=df[label_column], colorscale="Viridis", showscale=True
                ),
                visible=True,
            )
        )
    else:
        # No label column, or a problem type that is neither classification nor
        # regression: draw a plain trace so the dropdown buttons have a trace to update.
        fig.add_trace(
            go.Scatter(
                x=df[first_feature],
                y=df[first_feature],
                mode="markers",
                visible=True,
            )
        )

    x_buttons = []
    y_buttons = []

    for ncol in features:
        if split_by_label:
            # One data series per label trace, in the same order the traces were added.
            for axis, buttons in (("x", x_buttons), ("y", y_buttons)):
                buttons.append(
                    dict(
                        method="update",
                        label=ncol,
                        args=[
                            {axis: [df[ncol].loc[mask] for mask in label_masks]},
                            list(range(len(labels))),
                        ],
                    )
                )
        else:
            x_buttons.append(
                dict(method="update", label=ncol, args=[{"x": [df[ncol]]}])
            )
            y_buttons.append(
                dict(method="update", label=ncol, args=[{"y": [df[ncol]]}])
            )

    # Pass buttons to the updatemenus argument
    fig.update_layout(
        updatemenus=[
            dict(buttons=x_buttons, direction="up", x=0.5, y=-0.1),
            dict(buttons=y_buttons, direction="down", x=-max_feature_len / 100, y=0.5),
        ]
    )

    fig.update_layout(
        width=600,
        height=400,
        autosize=False,
        margin=dict(t=100, b=0, l=0, r=0),
        template="plotly_white",
        title_text="<i><b>Scatter-2d</b></i>",
    )

    extra_data["scatter-2d"] = context.log_artifact(
        PlotlyArtifact(key="scatter-2d", figure=fig),
        local_path=f"{plots_dest}/scatter-2d.html",
    )


def _create_violin_artifact(
    context: MLClientCtx, df: pd.DataFrame, extra_data: dict, plots_dest: str
):
    """
    Create and log a figure holding one violin plot per column.

    The plots are laid out on a grid of 5 columns, filled row by row in the order of the
    data frame columns. The figure is logged under the "violin" key and the logged
    artifact is stored in ``extra_data["violin"]``.

    :param context:    The function context, used to log the artifact
    :param df:         The data to plot (every column gets a violin plot)
    :param extra_data: Dictionary that collects the logged artifacts (updated in place)
    :param plots_dest: Destination folder of the html file
    """
    cols = 5
    # Ceiling division, so a column count that is a multiple of `cols` does not add an
    # empty row of subplots. make_subplots needs at least one row, hence the max().
    rows = max(1, -(-df.shape[1] // cols))
    fig = make_subplots(rows=rows, cols=cols)

    for plot_num, column_name in enumerate(df.columns):
        column_data = df[column_name]
        grid_row, grid_col = divmod(plot_num, cols)
        fig.add_trace(
            go.Violin(
                x=[column_name] * column_data.shape[0],
                y=column_data,
                name=column_name,
            ),
            row=grid_row + 1,  # the subplots grid is 1-based
            col=grid_col + 1,
        )

    fig.update_layout(
        height=(rows + 1) * 200,
        width=(cols + 1) * 200,
        title="<i><b>Violin Plots</b></i>",
        showlegend=False,
    )
    extra_data["violin"] = context.log_artifact(
        PlotlyArtifact(key="violin", figure=fig),
        local_path=f"{plots_dest}/violin.html",
    )


def _create_imbalance_artifact(
    context: MLClientCtx,
    df: pd.DataFrame,
    extra_data: dict,
    label_column: str,
    plots_dest: str,
    problem_type: str,
):
    """
    Create and log the labels imbalance artifacts (plot + csv).

    For a classification problem the fraction of every label value is computed and drawn
    as a pie chart. For any other problem type a histogram of the label values is drawn
    and the ``np.histogram`` bin edges and counts are tabulated.

    The plot is logged under the "imbalance" key and the table under the
    "imbalance-weights-vec" key; the logged artifacts are stored in
    ``extra_data["imbalance"]`` and ``extra_data["imbalance-csv"]``. Nothing is created
    when no label column is given.

    :param context:      The function context, used to log the artifacts
    :param df:           The data holding the label column
    :param extra_data:   Dictionary that collects the logged artifacts (updated in place)
    :param label_column: Name of the ground truth column, falsy value to skip
    :param plots_dest:   Destination folder of the html and csv files
    :param problem_type: "classification" for a pie chart of label fractions, any other
                         value for a histogram of the label values
    """
    if not label_column:
        return

    label_values = df[label_column]
    if problem_type == "classification":
        # Keep the fractions in their own column even when the label column itself is
        # named "count".
        values_column = "count" if label_column != "count" else "fraction"
        labels_count = label_values.value_counts().sort_index()
        # The table is built explicitly rather than from pd.DataFrame(labels_count): the
        # name pandas gives the value_counts() result changed in pandas 2.0 ("count"
        # instead of the original column name), so relying on it breaks on older versions.
        df_labels_count = pd.DataFrame(
            {
                values_column: labels_count.to_numpy() / labels_count.sum(),
                label_column: labels_count.index,
            },
            index=labels_count.index,
        )
        fig = px.pie(df_labels_count, names=label_column, values=values_column)
    else:
        fig = px.histogram(
            histfunc="count",
            x=label_values,
        )
        # np.histogram cannot derive a finite range from NaN values, so drop them first.
        counts, bin_edges = np.histogram(label_values.dropna())
        # There is one more bin edge than bins - pad the counts with a trailing 0 so both
        # columns have the same length.
        df_labels_count = pd.DataFrame(
            {"min_val": bin_edges, "count": counts.tolist() + [0]}
        )

    fig.update_layout(title_text="<i><b>Labels Imbalance</b></i>")
    extra_data["imbalance"] = context.log_artifact(
        PlotlyArtifact(key="imbalance", figure=fig),
        local_path=f"{plots_dest}/imbalance.html",
    )
    extra_data["imbalance-csv"] = context.log_artifact(
        TableArtifact("imbalance-weights-vec", df=df_labels_count),
        local_path=f"{plots_dest}/imbalance-weights-vec.csv",
    )


def _create_corr_artifact(
    context: MLClientCtx,
    df: pd.DataFrame,
    extra_data: dict,
    label_column: str,
    plots_dest: str,
):
    """
    Create and log the features correlation matrix artifacts (csv + plot).

    The correlation is computed over the numeric columns, excluding the label column when
    one is given. The table is logged under the "correlation-matrix-csv" key and an
    annotated heatmap of it under the "correlation" key; the logged artifacts are stored
    in ``extra_data`` under the same keys.

    :param context:      The function context, used to log the artifacts
    :param df:           The data to correlate
    :param extra_data:   Dictionary that collects the logged artifacts (updated in place)
    :param label_column: Name of the ground truth column to exclude, or None
    :param plots_dest:   Destination folder of the csv and html files
    """
    features_df = df if label_column is None else df.drop([label_column], axis=1)
    corr_table = features_df.corr(numeric_only=True)

    # The csv is logged before the plot is built.
    csv_artifact = TableArtifact("correlation-matrix-csv", df=corr_table, visible=True)
    extra_data["correlation-matrix-csv"] = context.log_artifact(
        csv_artifact,
        local_path=f"{plots_dest}/correlation-matrix.csv",
    )

    corr_values = corr_table.values.tolist()
    # Every cell is annotated with its value rounded to 2 decimal places.
    cell_labels = [[f"{value:.2f}" for value in row] for row in corr_values]
    heatmap = ff.create_annotated_heatmap(
        corr_values,
        x=list(corr_table.columns),
        y=list(corr_table.columns),
        annotation_text=cell_labels,
        colorscale="agsunset",
    )
    heatmap.layout.yaxis.autorange = "reversed"  # reverse the y axis direction
    heatmap.update_layout(title_text="<i><b>Correlation matrix</b></i>")
    heatmap.data[0].showscale = True  # show the color scale next to the heatmap

    plot_artifact = PlotlyArtifact(key="correlation", figure=heatmap)
    extra_data["correlation"] = context.log_artifact(
        plot_artifact,
        local_path=f"{plots_dest}/correlation.html",
    )
 diff --git a/functions/src/describe/item.yaml b/functions/src/describe/item.yaml index da26f1501..a1aa47372 100644 --- a/functions/src/describe/item.yaml +++ b/functions/src/describe/item.yaml @@ -11,7 +11,7 @@ labels: author: Iguazio maintainers: [] marketplaceType: '' -mlrunVersion: 1.7.0 +mlrunVersion: 1.10.0 name: describe platformVersion: 3.5.3 spec: @@ -21,4 +21,4 @@ spec: kind: job requirements: [] url: '' -version: 1.4.0 +version: 1.5.0 diff --git a/functions/src/describe/requirements.txt b/functions/src/describe/requirements.txt index 15492b176..7a15c8465 100644 --- a/functions/src/describe/requirements.txt +++ b/functions/src/describe/requirements.txt @@ -1,4 +1,4 @@ -scikit-learn~=1.0.2 +scikit-learn~=1.5 plotly~=5.23 pytest~=7.0.1 matplotlib~=3.5.1 diff --git a/functions/src/gen_class_data/function.yaml b/functions/src/gen_class_data/function.yaml index 1769bec07..fde89341e 100644 --- a/functions/src/gen_class_data/function.yaml +++ b/functions/src/gen_class_data/function.yaml @@ -1,13 +1,15 @@ metadata: - categories: - - data-generation tag: '' name: gen-class-data + categories: + - data-generation +verbose: false spec: description: Create a binary classification sample dataset and save. - default_handler: gen_class_data entry_points: gen_class_data: + lineno: 22 + has_varargs: false has_kwargs: false parameters: - name: context @@ -48,7 +50,6 @@ spec: - name: sk_params doc: additional parameters for `sklearn.datasets.make_classification` default: {} - lineno: 22 doc: 'Create a binary classification sample dataset and save. If no filename is given it will default to: @@ -59,14 +60,14 @@ spec: Additional scikit-learn parameters can be set using **sk_params, please see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details.' 
- has_varargs: false name: gen_class_data - command: '' - disable_auto_mount: false - image: mlrun/mlrun build: origin_filename: '' functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZApmcm9tIHR5cGluZyBpbXBvcnQgT3B0aW9uYWwsIExpc3QKZnJvbSBza2xlYXJuLmRhdGFzZXRzIGltcG9ydCBtYWtlX2NsYXNzaWZpY2F0aW9uCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKCgpkZWYgZ2VuX2NsYXNzX2RhdGEoCiAgICAgICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICAgICAgbl9zYW1wbGVzOiBpbnQsCiAgICAgICAgbV9mZWF0dXJlczogaW50LAogICAgICAgIGtfY2xhc3NlczogaW50LAogICAgICAgIGhlYWRlcjogT3B0aW9uYWxbTGlzdFtzdHJdXSwKICAgICAgICBsYWJlbF9jb2x1bW46IE9wdGlvbmFsW3N0cl0gPSAibGFiZWxzIiwKICAgICAgICB3ZWlnaHQ6IGZsb2F0ID0gMC41LAogICAgICAgIHJhbmRvbV9zdGF0ZTogaW50ID0gMSwKICAgICAgICBrZXk6IHN0ciA9ICJjbGFzc2lmaWVyLWRhdGEiLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgc2tfcGFyYW1zPXt9Cik6CiAgICAiIiJDcmVhdGUgYSBiaW5hcnkgY2xhc3NpZmljYXRpb24gc2FtcGxlIGRhdGFzZXQgYW5kIHNhdmUuCiAgICBJZiBubyBmaWxlbmFtZSBpcyBnaXZlbiBpdCB3aWxsIGRlZmF1bHQgdG86CiAgICAic2ltZGF0YS17bl9zYW1wbGVzfVh7bV9mZWF0dXJlc30ucGFycXVldCIuCgogICAgQWRkaXRpb25hbCBzY2lraXQtbGVhcm4gcGFyYW1ldGVycyBjYW4gYmUgc2V0IHVzaW5nICoqc2tfcGFyYW1zLCBwbGVhc2Ugc2VlIGh0dHBzOi8vc2Npa2l0LWxlYXJuLm9yZy9zdGFibGUvbW9kdWxlcy9nZW5lcmF0ZWQvc2tsZWFybi5kYXRh
c2V0cy5tYWtlX2NsYXNzaWZpY2F0aW9uLmh0bWwgZm9yIG1vcmUgZGV0YWlscy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIG5fc2FtcGxlczogICAgIG51bWJlciBvZiByb3dzL3NhbXBsZXMKICAgIDpwYXJhbSBtX2ZlYXR1cmVzOiAgICBudW1iZXIgb2YgY29scy9mZWF0dXJlcwogICAgOnBhcmFtIGtfY2xhc3NlczogICAgIG51bWJlciBvZiBjbGFzc2VzCiAgICA6cGFyYW0gaGVhZGVyOiAgICAgICAgaGVhZGVyIGZvciBmZWF0dXJlcyBhcnJheQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogIGNvbHVtbiBuYW1lIG9mIGdyb3VuZC10cnV0aCBzZXJpZXMKICAgIDpwYXJhbSB3ZWlnaHQ6ICAgICAgICBmcmFjdGlvbiBvZiBzYW1wbGUgbmVnYXRpdmUgdmFsdWUgKGdyb3VuZC10cnV0aD0wKQogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogIHJuZyBzZWVkIChzZWUgaHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS9nbG9zc2FyeS5odG1sI3Rlcm0tcmFuZG9tLXN0YXRlKQogICAgOnBhcmFtIGtleTogICAgICAgICAgIGtleSBvZiBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgKHBxdCkgZXh0ZW5zaW9uIGZvciBwYXJxdWV0IGZpbGUKICAgIDpwYXJhbSBza19wYXJhbXM6ICAgICBhZGRpdGlvbmFsIHBhcmFtZXRlcnMgZm9yIGBza2xlYXJuLmRhdGFzZXRzLm1ha2VfY2xhc3NpZmljYXRpb25gCiAgICAiIiIKICAgIGZlYXR1cmVzLCBsYWJlbHMgPSBtYWtlX2NsYXNzaWZpY2F0aW9uKAogICAgICAgIG5fc2FtcGxlcz1uX3NhbXBsZXMsCiAgICAgICAgbl9mZWF0dXJlcz1tX2ZlYXR1cmVzLAogICAgICAgIHdlaWdodHM9d2VpZ2h0LAogICAgICAgIG5fY2xhc3Nlcz1rX2NsYXNzZXMsCiAgICAgICAgcmFuZG9tX3N0YXRlPXJhbmRvbV9zdGF0ZSwKICAgICAgICAqKnNrX3BhcmFtcykKCiAgICAjIG1ha2UgZGF0YWZyYW1lcywgYWRkIGNvbHVtbiBuYW1lcywgY29uY2F0ZW5hdGUgKFgsIHkpCiAgICBYID0gcGQuRGF0YUZyYW1lKGZlYXR1cmVzKQogICAgaWYgbm90IGhlYWRlcjoKICAgICAgICBYLmNvbHVtbnMgPSBbImZlYXRfIiArIHN0cih4KSBmb3IgeCBpbiByYW5nZShtX2ZlYXR1cmVzKV0KICAgIGVsc2U6CiAgICAgICAgWC5jb2x1bW5zID0gaGVhZGVyCgogICAgeSA9IHBkLkRhdGFGcmFtZShsYWJlbHMsIGNvbHVtbnM9W2xhYmVsX2NvbHVtbl0pCiAgICBkYXRhID0gcGQuY29uY2F0KFtYLCB5XSwgYXhpcz0xKQoKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoa2V5LCBkZj1kYXRhLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PUZhbHNlKQo= code_origin: '' + filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/gen_class_data/gen_class_data.py + command: '' + image: mlrun/mlrun + default_handler: gen_class_data + disable_auto_mount: false 
kind: job -verbose: false diff --git a/functions/src/gen_class_data/item.yaml b/functions/src/gen_class_data/item.yaml index 30f5cd21c..082b00305 100644 --- a/functions/src/gen_class_data/item.yaml +++ b/functions/src/gen_class_data/item.yaml @@ -11,7 +11,7 @@ labels: author: Iguazio maintainers: [] marketplaceType: '' -mlrunVersion: 1.7.0 +mlrunVersion: 1.10.0 name: gen_class_data platformVersion: 3.5.3 spec: @@ -21,4 +21,4 @@ spec: kind: job requirements: [] url: '' -version: 1.3.0 +version: 1.4.0 diff --git a/functions/src/gen_class_data/requirements.txt b/functions/src/gen_class_data/requirements.txt index d7dbe376b..fc53d535f 100644 --- a/functions/src/gen_class_data/requirements.txt +++ b/functions/src/gen_class_data/requirements.txt @@ -1,2 +1,2 @@ pandas -scikit-learn==1.0.2 \ No newline at end of file +scikit-learn~=1.5 \ No newline at end of file diff --git a/functions/src/gen_class_data/test_gen_class_data.py b/functions/src/gen_class_data/test_gen_class_data.py index e06eeb16b..990075dec 100644 --- a/functions/src/gen_class_data/test_gen_class_data.py +++ b/functions/src/gen_class_data/test_gen_class_data.py @@ -36,4 +36,7 @@ def test_gen_class_data(): local=True, artifact_path="./artifacts", ) - assert os.path.isfile(run.status.artifacts[0]['spec']['target_path']), 'dataset is not available' + # In local mode, artifacts are in function-name/iteration subdirectory + # Default key is "classifier-data" (can be overridden in params) + dataset_path = "./artifacts/test-gen-class-data-gen-class-data/0/classifier-data.csv" + assert os.path.isfile(dataset_path), f'dataset is not available at {dataset_path}' diff --git a/functions/src/sklearn_classifier/function.yaml b/functions/src/sklearn_classifier/function.yaml index 205df697d..603922c95 100644 --- a/functions/src/sklearn_classifier/function.yaml +++ b/functions/src/sklearn_classifier/function.yaml @@ -1,10 +1,98 @@ +kind: job spec: - image: mlrun/mlrun - description: train any classifier using 
scikit-learn's API default_handler: train_model + command: '' + image: mlrun/mlrun entry_points: + get_sample: + has_kwargs: false + has_varargs: false + lineno: 33 + parameters: + - name: dataset + type: DataItem + doc: DataItem containing the dataset + - name: sample + type: int + doc: Number of samples to take. If -1, use all. If < -1, take random sample. + - name: label_column + type: str + doc: Name of the label column + outputs: + - type: Tuple[pd.DataFrame, pd.Series, list] + name: get_sample + doc: Get a sample of the dataset with labels separated. + get_splits: + has_kwargs: false + has_varargs: false + lineno: 56 + parameters: + - name: features + type: DataFrame + doc: Feature DataFrame + - name: labels + type: Series + doc: Labels Series + - name: num_splits + type: int + doc: Number of splits (3 for train/val/test) + - name: test_size + type: float + doc: Proportion for test set + - name: val_size + type: float + doc: Proportion of remaining data for validation + - name: random_state + type: int + doc: Random seed + default: 1 + outputs: + - type: List[Tuple[pd.DataFrame, pd.Series]] + name: get_splits + doc: Split data into train, validation, and test sets. + gen_sklearn_model: + has_kwargs: false + has_varargs: false + lineno: 86 + parameters: + - name: model_pkg_class + type: str + doc: Full class path (e.g., "sklearn.ensemble.RandomForestClassifier") + - name: parameters + type: list + doc: List of (key, value) parameter tuples + outputs: + - type: dict + name: gen_sklearn_model + doc: Generate sklearn model configuration from class name and parameters. 
+ eval_model_v2: + has_kwargs: false + has_varargs: false + lineno: 117 + parameters: + - name: context + type: MLClientCtx + doc: MLRun context + - name: xvalid + type: DataFrame + doc: Validation features + - name: yvalid + type: Series + doc: Validation labels + - name: model + doc: Trained sklearn model + - name: plots_artifact_path + type: str + doc: Path for plots (not used in this simplified version) + default: null + outputs: + - type: dict + name: eval_model_v2 + doc: Evaluate a sklearn classifier model. train_model: + has_kwargs: false has_varargs: false + lineno: 148 parameters: - name: context type: MLClientCtx @@ -28,7 +116,7 @@ spec: type: int doc: Selects the first n rows, or select a sample starting from the first. If negative <-1, select a random sample - default: + default: - name: test_size type: float doc: (0.05) test set size @@ -64,6 +152,8 @@ spec: type: int doc: (1) sklearn rng seed default: 1 + outputs: + - type: None name: train_model doc: 'train a classifier @@ -76,21 +166,17 @@ spec: scalar "results", a "plots" keys with a list of PlotArtifacts, and and "tables" key containing a returned list of TableArtifacts.' 
- outputs: - - type: None - lineno: 32 - has_kwargs: false - disable_auto_mount: false build: - functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKIyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzCmltcG9ydCBwYW5kYXMgYXMgcGQKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLm1sdXRpbHMuZGF0YSBpbXBvcnQgZ2V0X3NhbXBsZSwgZ2V0X3NwbGl0cwpmcm9tIG1scnVuLm1sdXRpbHMubW9kZWxzIGltcG9ydCBnZW5fc2tsZWFybl9tb2RlbCwgZXZhbF9tb2RlbF92Mgpmcm9tIG1scnVuLnV0aWxzLmhlbHBlcnMgaW1wb3J0IGNyZWF0ZV9jbGFzcwoKCmRlZiB0cmFpbl9tb2RlbCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgbW9kZWxfcGtnX2NsYXNzOiBzdHIsCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsX2NvbHVtbjogc3RyID0gImxhYmVscyIsCiAgICBlbmNvZGVfY29sczogTGlzdFtzdHJdID0gW10sCiAgICBzYW1wbGU6IGludCA9IC0xLAogICAgdGVzdF9zaXplOiBmbG9hdCA9IDAuMzAsCiAgICB0cmFpbl92YWxfc3BsaXQ6IGZsb2F0ID0gMC43MCwKICAgIHRlc3Rfc2V0X2tleTogc3RyID0gInRlc3Rfc2V0IiwKICAgIG1vZGVsX2V2YWx1YXRvcj1Ob25lLAogICAgbW9kZWxzX2Rlc3Q6IHN0ciA9ICIiLAogICAgcGxvdHNfZGVzdDogc3RyID0gInBsb3RzIiwKICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIs
CiAgICBtb2RlbF9wa2dfZmlsZTogc3RyID0gIiIsCiAgICByYW5kb21fc3RhdGU6IGludCA9IDEsCikgLT4gTm9uZToKICAgICIiInRyYWluIGEgY2xhc3NpZmllcgoKICAgIEFuIG9wdGlvbmFsIGN1dG9tIG1vZGVsIGV2YWx1YXRvciBjYW4gYmUgc3VwcGxpZWQgdGhhdCBzaG91bGQgaGF2ZSB0aGUgc2lnbmF0dXJlOgogICAgYG15X2N1c3RvbV9ldmFsdWF0b3IoY29udGV4dCwgeHZhbGlkLCB5dmFsaWQsIG1vZGVsKWAgYW5kIHJldHVybiBhIGRpY3Rpb25hcnkgb2YKICAgIHNjYWxhciAicmVzdWx0cyIsIGEgInBsb3RzIiBrZXlzIHdpdGggYSBsaXN0IG9mIFBsb3RBcnRpZmFjdHMsIGFuZAogICAgYW5kICJ0YWJsZXMiIGtleSBjb250YWluaW5nIGEgcmV0dXJuZWQgbGlzdCBvZiBUYWJsZUFydGlmYWN0cy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICAgIHRoZSBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gbW9kZWxfcGtnX2NsYXNzOiAgIHRoZSBtb2RlbCB0byB0cmFpbiwgZS5nLCAic2tsZWFybi5uZXVyYWxfbmV0d29ya3MuTUxQQ2xhc3NpZmllciIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIG9yIGpzb24gbW9kZWwgY29uZmlnCiAgICA6cGFyYW0gZGF0YXNldDogICAgICAgICAgICgiZGF0YSIpIG5hbWUgb2YgcmF3IGRhdGEgZmlsZQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogICAgICBncm91bmQtdHJ1dGggKHkpIGxhYmVscwogICAgOnBhcmFtIGVuY29kZV9jb2xzOiAgICAgICBkaWN0aW9uYXJ5IG9mIG5hbWVzIGFuZCBwcmVmaXhlcyBmb3IgY29sdW1ucyB0aGF0IGFyZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0byBob3QgYmUgZW5jb2RlZC4KICAgIDpwYXJhbSBzYW1wbGU6ICAgICAgICAgICAgU2VsZWN0cyB0aGUgZmlyc3QgbiByb3dzLCBvciBzZWxlY3QgYSBzYW1wbGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RhcnRpbmcgZnJvbSB0aGUgZmlyc3QuIElmIG5lZ2F0aXZlIDwtMSwgc2VsZWN0CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGEgcmFuZG9tIHNhbXBsZQogICAgOnBhcmFtIHRlc3Rfc2l6ZTogICAgICAgICAoMC4wNSkgdGVzdCBzZXQgc2l6ZQogICAgOnBhcmFtIHRyYWluX3ZhbF9zcGxpdDogICAoMC43NSkgT25jZSB0aGUgdGVzdCBzZXQgaGFzIGJlZW4gcmVtb3ZlZCB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdHJhaW5pbmcgc2V0IGdldHMgdGhpcyBwcm9wb3J0aW9uLgogICAgOnBhcmFtIHRlc3Rfc2V0X2tleTogICAgICBrZXkgb2YgaGVsZCBvdXQgZGF0YSBpbiBhcnRpZmFjdCBzdG9yZQogICAgOnBhcmFtIG1vZGVsX2V2YWx1YXRvcjogICAoTm9uZSkgYSBjdXN0b20gbW9kZWwgZXZhbHVhdG9yIGNhbiBiZSBzcGVjaWZpZWQKICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgICAgKCIiKSBtb2RlbHMgc3ViZm9sZGVyIG9uIGFydGlmYWN0IHBhdGgKICAgIDpwYXJhbSBwbG90c19kZXN0OiAgICAgICAgcGxvdCBzdWJm
b2xkZXIgb24gYXJ0aWZhY3QgcGF0aAogICAgOnBhcmFtIGZpbGVfZXh0OiAgICAgICAgICAoInBhcnF1ZXQiKSBmb3JtYXQgZm9yIHRlc3Rfc2V0X2tleSBob2xkIG91dCBkYXRhCiAgICA6cGFyYW0gcmFuZG9tX3N0YXRlOiAgICAgICgxKSBza2xlYXJuIHJuZyBzZWVkCgogICAgIiIiCiAgICBtb2RlbHNfZGVzdCA9IG1vZGVsc19kZXN0IG9yICJtb2RlbCIKCiAgICByYXcsIGxhYmVscywgaGVhZGVyID0gZ2V0X3NhbXBsZShkYXRhc2V0LCBzYW1wbGUsIGxhYmVsX2NvbHVtbikKCiAgICBpZiBlbmNvZGVfY29sczoKICAgICAgICByYXcgPSBwZC5nZXRfZHVtbWllcygKICAgICAgICAgICAgcmF3LAogICAgICAgICAgICBjb2x1bW5zPWxpc3QoZW5jb2RlX2NvbHMua2V5cygpKSwKICAgICAgICAgICAgcHJlZml4PWxpc3QoZW5jb2RlX2NvbHMudmFsdWVzKCkpLAogICAgICAgICAgICBkcm9wX2ZpcnN0PVRydWUsCiAgICAgICAgKQoKICAgICh4dHJhaW4sIHl0cmFpbiksICh4dmFsaWQsIHl2YWxpZCksICh4dGVzdCwgeXRlc3QpID0gZ2V0X3NwbGl0cygKICAgICAgICByYXcsIGxhYmVscywgMywgdGVzdF9zaXplLCAxIC0gdHJhaW5fdmFsX3NwbGl0LCByYW5kb21fc3RhdGUKICAgICkKCiAgICB0ZXN0X3NldCA9IHBkLmNvbmNhdChbeHRlc3QsIHl0ZXN0LnRvX2ZyYW1lKCldLCBheGlzPTEpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgIHRlc3Rfc2V0X2tleSwKICAgICAgICBkZj10ZXN0X3NldCwKICAgICAgICBmb3JtYXQ9ZmlsZV9leHQsCiAgICAgICAgaW5kZXg9RmFsc2UsCiAgICAgICAgbGFiZWxzPXsiZGF0YS10eXBlIjogImhlbGQtb3V0In0sCiAgICAgICAgYXJ0aWZhY3RfcGF0aD1jb250ZXh0LmFydGlmYWN0X3N1YnBhdGgoImRhdGEiKSwKICAgICkKCiAgICBtb2RlbF9jb25maWcgPSBnZW5fc2tsZWFybl9tb2RlbChtb2RlbF9wa2dfY2xhc3MsIGNvbnRleHQucGFyYW1ldGVycy5pdGVtcygpKQoKICAgIG1vZGVsX2NvbmZpZ1siRklUIl0udXBkYXRlKHsiWCI6IHh0cmFpbiwgInkiOiB5dHJhaW4udmFsdWVzfSkKCiAgICBDbGFzc2lmaWVyQ2xhc3MgPSBjcmVhdGVfY2xhc3MobW9kZWxfY29uZmlnWyJNRVRBIl1bImNsYXNzIl0pCgogICAgbW9kZWwgPSBDbGFzc2lmaWVyQ2xhc3MoKiptb2RlbF9jb25maWdbIkNMQVNTIl0pCgogICAgbW9kZWwuZml0KCoqbW9kZWxfY29uZmlnWyJGSVQiXSkKCiAgICBhcnRpZmFjdF9wYXRoID0gY29udGV4dC5hcnRpZmFjdF9zdWJwYXRoKG1vZGVsc19kZXN0KQogICAgcGxvdHNfcGF0aCA9IGNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aChtb2RlbHNfZGVzdCwgcGxvdHNfZGVzdCkKICAgIGlmIG1vZGVsX2V2YWx1YXRvcjoKICAgICAgICBldmFsX21ldHJpY3MgPSBtb2RlbF9ldmFsdWF0b3IoCiAgICAgICAgICAgIGNvbnRleHQsIHh2YWxpZCwgeXZhbGlkLCBtb2RlbCwgcGxvdHNfYXJ0aWZhY3RfcGF0aD1wbG90c19wYXRoCiAgICAgICAgKQogICAgZWxzZToKICAgICAgICBl
dmFsX21ldHJpY3MgPSBldmFsX21vZGVsX3YyKAogICAgICAgICAgICBjb250ZXh0LCB4dmFsaWQsIHl2YWxpZCwgbW9kZWwsIHBsb3RzX2FydGlmYWN0X3BhdGg9cGxvdHNfcGF0aAogICAgICAgICkKCiAgICBrd2FyZ3MgPSB7InRyYWluaW5nX3NldCI6IHRlc3Rfc2V0LCAibGFiZWxfY29sdW1uIjogbGFiZWxfY29sdW1ufQogICAgc3BsaXQgPSBtb2RlbF9wa2dfY2xhc3MucnNwbGl0KCIuIiwgMSkKICAgIGlmIHNwbGl0IGFuZCBsZW4oc3BsaXQpID09IDI6CiAgICAgICAga3dhcmdzWyJhbGdvcml0aG0iXSA9IHNwbGl0WzFdCgogICAgaWYgZGF0YXNldC5tZXRhIGFuZCBkYXRhc2V0Lm1ldGEua2luZCA9PSAiRmVhdHVyZVZlY3RvciI6CiAgICAgICAga3dhcmdzWyJmZWF0dXJlX3ZlY3RvciJdID0gZGF0YXNldC5tZXRhLnVyaQoKICAgIGNvbnRleHQuc2V0X2xhYmVsKCJjbGFzcyIsIG1vZGVsX3BrZ19jbGFzcykKICAgIGNvbnRleHQubG9nX21vZGVsKAogICAgICAgICJtb2RlbCIsCiAgICAgICAgYm9keT1kdW1wcyhtb2RlbCksCiAgICAgICAgYXJ0aWZhY3RfcGF0aD1hcnRpZmFjdF9wYXRoLAogICAgICAgIGV4dHJhX2RhdGE9ZXZhbF9tZXRyaWNzLAogICAgICAgIG1vZGVsX2ZpbGU9Im1vZGVsLnBrbCIsCiAgICAgICAgbWV0cmljcz1jb250ZXh0LnJlc3VsdHMsCiAgICAgICAgbGFiZWxzPXsiY2xhc3MiOiBtb2RlbF9wa2dfY2xhc3N9LAogICAgICAgIGZyYW1ld29yaz0ic2tsZWFybiIsCiAgICAgICAgKiprd2FyZ3MKICAgICkK + functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Generated by nuclio.export.NuclioExporter

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)


from cloudpickle import dumps
import pandas as pd
import numpy as np
from typing import List, Tuple
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.utils.helpers import create_class


def get_sample(dataset: DataItem, sample: int, label_column: str) -> Tuple[pd.DataFrame, pd.Series, list]:
    """Load the dataset, optionally sub-sample it, and separate the labels.

    :param dataset: DataItem containing the dataset (read through ``as_df()``)
    :param sample: Row selection. ``-1`` keeps every row; a value ``< -1``
                   draws ``abs(sample)`` random rows with a fixed seed (capped
                   at the number of available rows); any other value keeps
                   the first ``sample`` rows.
    :param label_column: Name of the label column

    :returns: ``(features, labels, header)``: the frame without the label
              column, the label series, and the list of feature column names.
    """
    df = dataset.as_df()

    if sample == -1:
        sampled_df = df
    elif sample < -1:
        # Cap the request at the population size: DataFrame.sample raises when
        # asked for more rows than exist (sampling without replacement),
        # whereas the head() branch below simply returns what is available.
        sample_size = min(abs(sample), len(df))
        sampled_df = df.sample(n=sample_size, random_state=1)
    else:
        sampled_df = df.head(sample)

    labels = sampled_df[label_column]
    features = sampled_df.drop(label_column, axis=1)
    header = list(features.columns)

    return features, labels, header


def get_splits(
    features: pd.DataFrame,
    labels: pd.Series,
    num_splits: int,
    test_size: float,
    val_size: float,
    random_state: int = 1
) -> List[Tuple[pd.DataFrame, pd.Series]]:
    """Split data into train/test or train/validation/test sets.

    :param features: Feature DataFrame
    :param labels: Labels Series
    :param num_splits: Number of splits: 2 for train/test, 3 for train/val/test
    :param test_size: Proportion of the full data held out for the test set
    :param val_size: Proportion of the remaining (non-test) data used for
                     validation; ignored when ``num_splits`` is 2
    :param random_state: Random seed used for every split

    :returns: A list of ``(features, labels)`` pairs ordered train,
              [validation,] test.
    :raises ValueError: if ``num_splits`` is neither 2 nor 3
    """
    # Validate up front so an unsupported request fails loudly instead of
    # silently producing a different number of splits than was asked for.
    if num_splits not in (2, 3):
        raise ValueError(f"num_splits must be 2 or 3, got {num_splits!r}")

    # First split: separate test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    if num_splits == 2:
        return [(X_temp, y_temp), (X_test, y_test)]

    # Second split: separate train and validation from remaining data
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_size, random_state=random_state
    )

    return [(X_train, y_train), (X_val, y_val), (X_test, y_test)]


def gen_sklearn_model(model_pkg_class: str, parameters: list) -> dict:
    """Generate sklearn model configuration from class name and parameters.

    Each parameter is routed as follows:

    * ``CLASS_<name>`` -> constructor argument ``<name>``
    * ``FIT_<name>``   -> ``fit()`` argument ``<name>``
    * ``X``, ``y``, ``sample_weight`` -> ``fit()`` arguments
    * arguments of ``train_model`` itself (see ``excluded_params``) -> dropped
    * anything else -> constructor argument, name unchanged

    :param model_pkg_class: Full class path (e.g., "sklearn.ensemble.RandomForestClassifier")
    :param parameters: Iterable of (key, value) parameter tuples, or a dict

    :returns: dict with ``"META"`` (holds the class path), ``"CLASS"``
              (constructor kwargs) and ``"FIT"`` (fit kwargs) entries.
    """
    config = {
        "META": {"class": model_pkg_class},
        "CLASS": {},
        "FIT": {}
    }

    # Parameters that should not be passed to sklearn model
    excluded_params = {
        'model_pkg_class', 'dataset', 'label_column', 'encode_cols',
        'sample', 'test_size', 'train_val_split', 'test_set_key',
        'model_evaluator', 'models_dest', 'plots_dest', 'file_ext',
        'model_pkg_file', 'context'
    }
    fit_params = {'X', 'y', 'sample_weight'}
    class_prefix, fit_prefix = "CLASS_", "FIT_"

    # Accept a plain mapping as well as an iterable of pairs
    pairs = parameters.items() if isinstance(parameters, dict) else parameters

    # Separate parameters into model init params and fit params
    for key, value in pairs:
        is_text = isinstance(key, str)
        if is_text and key.startswith(class_prefix):
            # Explicitly targeted constructor argument: strip the prefix so the
            # estimator receives the real keyword name.
            config["CLASS"][key[len(class_prefix):]] = value
        elif is_text and key.startswith(fit_prefix):
            config["FIT"][key[len(fit_prefix):]] = value
        elif key in fit_params:
            config["FIT"][key] = value
        elif key not in excluded_params:
            # Only add parameters that are not function-specific
            config["CLASS"][key] = value

    return config


def eval_model_v2(
    context: MLClientCtx,
    xvalid: pd.DataFrame,
    yvalid: pd.Series,
    model,
    plots_artifact_path: str = None
) -> dict:
    """Score a fitted classifier on the validation set and log the results.

    Accuracy plus weighted-average precision, recall and F1 are computed from
    ``model.predict(xvalid)`` and each one is recorded on the context with
    ``log_result``.

    :param context: MLRun context that receives the logged results
    :param xvalid: Validation features
    :param yvalid: Validation labels
    :param model: Trained sklearn model (must expose ``predict``)
    :param plots_artifact_path: Accepted for signature compatibility; no plots
                                are produced, so the value is never read

    :returns: Always an empty dict
    """
    predictions = model.predict(xvalid)

    # Averaging settings shared by the three per-class metrics
    weighted = {"average": 'weighted', "zero_division": 0}

    # Every score is computed before anything is logged
    scores = [
        ("accuracy", accuracy_score(yvalid, predictions)),
        ("precision", precision_score(yvalid, predictions, **weighted)),
        ("recall", recall_score(yvalid, predictions, **weighted)),
        ("f1_score", f1_score(yvalid, predictions, **weighted)),
    ]

    for name, score in scores:
        context.log_result(name, score)

    return {}


def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    encode_cols: dict = None,
    sample: int = -1,
    test_size: float = 0.30,
    train_val_split: float = 0.70,
    test_set_key: str = "test_set",
    model_evaluator=None,
    models_dest: str = "",
    plots_dest: str = "plots",
    file_ext: str = "parquet",
    model_pkg_file: str = "",
    random_state: int = 1,
) -> None:
    """train a classifier

    The dataset is sampled, optionally one-hot encoded, split into
    train/validation/test sets, the held-out test set is logged as a dataset
    artifact, and the fitted model is evaluated and logged as a model artifact.

    An optional custom model evaluator can be supplied. It is called as
    `my_custom_evaluator(context, xvalid, yvalid, model, plots_artifact_path=...)`
    and its return value is passed to `context.log_model` as `extra_data`.

    :param context:           the function context
    :param model_pkg_class:   the model class to train, given as a full class
                              path, e.g. "sklearn.ensemble.RandomForestClassifier"
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       (None) dictionary of names and prefixes for columns
                              that are to be one-hot encoded. A plain list of
                              column names is also accepted, in which case the
                              column names themselves are used as prefixes.
    :param sample:            Selects the first n rows, or select a sample
                              starting from the first. If negative <-1, select
                              a random sample
    :param test_size:         (0.30) test set size
    :param train_val_split:   (0.70) Once the test set has been removed the
                              training set gets this proportion.
    :param test_set_key:      key of held out data in artifact store
    :param model_evaluator:   (None) a custom model evaluator can be specified
    :param models_dest:       ("") models subfolder on artifact path,
                              "model" is used when empty
    :param plots_dest:        plot subfolder on artifact path
    :param file_ext:          ("parquet") format for test_set_key hold out data
    :param model_pkg_file:    ("") not read by this function
    :param random_state:      (1) sklearn rng seed

    """
    models_dest = models_dest or "model"

    raw, labels, _ = get_sample(dataset, sample, label_column)

    if encode_cols:
        if isinstance(encode_cols, dict):
            columns = list(encode_cols.keys())
            prefix = list(encode_cols.values())
        else:
            # A bare list of column names: prefix=None makes get_dummies fall
            # back to the column names as prefixes.
            columns = list(encode_cols)
            prefix = None
        raw = pd.get_dummies(
            raw,
            columns=columns,
            prefix=prefix,
            drop_first=True,
        )

    # get_splits takes the validation share of the non-test data, which is
    # the complement of the training share.
    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, 1 - train_val_split, random_state
    )

    test_set = pd.concat([xtest, ytest.to_frame()], axis=1)
    context.log_dataset(
        test_set_key,
        df=test_set,
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath("data"),
    )

    model_config = gen_sklearn_model(model_pkg_class, context.parameters.items())

    # The training split always wins over any X / y supplied via run parameters
    model_config["FIT"].update({"X": xtrain, "y": ytrain.values})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)
    evaluator = model_evaluator or eval_model_v2
    eval_metrics = evaluator(
        context, xvalid, yvalid, model, plots_artifact_path=plots_path
    )

    # NOTE(review): the held-out test set is what gets passed as
    # `training_set`; confirm this is intended rather than the train split.
    kwargs = {"training_set": test_set, "label_column": label_column}
    split = model_pkg_class.rsplit(".", 1)
    if len(split) == 2:
        # Last path component is the class name, e.g. "RandomForestClassifier"
        kwargs["algorithm"] = split[1]

    if dataset.meta and dataset.meta.kind == "FeatureVector":
        kwargs["feature_vector"] = dataset.meta.uri

    context.set_label("class", model_pkg_class)
    context.log_model(
        "model",
        body=dumps(model),
        artifact_path=artifact_path,
        extra_data=eval_metrics,
        model_file="model.pkl",
        metrics=context.results,
        labels={"class": model_pkg_class},
        framework="sklearn",
        **kwargs
    )
 origin_filename: '' code_origin: '' - command: '' + disable_auto_mount: false + filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/sklearn_classifier/sklearn_classifier.py + description: train any classifier using scikit-learn's API metadata: - tag: '' name: sklearn-classifier categories: - machine-learning - model-training + tag: '' verbose: false -kind: job diff --git a/functions/src/sklearn_classifier/item.yaml b/functions/src/sklearn_classifier/item.yaml index b9726fb79..4fa374938 100644 --- a/functions/src/sklearn_classifier/item.yaml +++ b/functions/src/sklearn_classifier/item.yaml @@ -13,7 +13,7 @@ labels: framework: sklearn maintainers: [] marketplaceType: '' -mlrunVersion: 1.7.0 +mlrunVersion: 1.10.0 name: sklearn-classifier platformVersion: 3.5.3 spec: @@ -23,5 +23,5 @@ spec: kind: job requirements: [] url: '' -version: 1.2.0 +version: 1.3.0 test_valid: false diff --git a/functions/src/sklearn_classifier/requirements.txt b/functions/src/sklearn_classifier/requirements.txt index 4d9e097f9..97a565a9e 100644 --- a/functions/src/sklearn_classifier/requirements.txt +++ b/functions/src/sklearn_classifier/requirements.txt @@ -1,5 +1,5 @@ pandas -scikit-learn==1.0.2 +scikit-learn~=1.5 matplotlib seaborn scikit-plot diff --git a/functions/src/sklearn_classifier/sklearn_classifier.py b/functions/src/sklearn_classifier/sklearn_classifier.py index 1a73d4045..724f78356 100644 --- a/functions/src/sklearn_classifier/sklearn_classifier.py +++ b/functions/src/sklearn_classifier/sklearn_classifier.py @@ -21,14 +21,130 @@ from cloudpickle import dumps import pandas as pd -from typing import List +import numpy as np +from typing import List, Tuple +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score from mlrun.execution import MLClientCtx from mlrun.datastore import DataItem -from mlrun.mlutils.data import get_sample, get_splits -from mlrun.mlutils.models import 
gen_sklearn_model, eval_model_v2 from mlrun.utils.helpers import create_class +def get_sample(dataset: DataItem, sample: int, label_column: str) -> Tuple[pd.DataFrame, pd.Series, list]: + """Get a sample of the dataset with labels separated. + + :param dataset: DataItem containing the dataset + :param sample: Number of samples to take. If -1, use all. If < -1, take random sample. + :param label_column: Name of the label column + """ + df = dataset.as_df() + + if sample == -1: + sampled_df = df + elif sample < -1: + sampled_df = df.sample(n=abs(sample), random_state=1) + else: + sampled_df = df.head(sample) + + labels = sampled_df[label_column] + features = sampled_df.drop(label_column, axis=1) + header = list(features.columns) + + return features, labels, header + + +def get_splits( + features: pd.DataFrame, + labels: pd.Series, + num_splits: int, + test_size: float, + val_size: float, + random_state: int = 1 +) -> List[Tuple[pd.DataFrame, pd.Series]]: + """Split data into train, validation, and test sets. + + :param features: Feature DataFrame + :param labels: Labels Series + :param num_splits: Number of splits (3 for train/val/test) + :param test_size: Proportion for test set + :param val_size: Proportion of remaining data for validation + :param random_state: Random seed + """ + # First split: separate test set + X_temp, X_test, y_temp, y_test = train_test_split( + features, labels, test_size=test_size, random_state=random_state + ) + + # Second split: separate train and validation from remaining data + X_train, X_val, y_train, y_val = train_test_split( + X_temp, y_temp, test_size=val_size, random_state=random_state + ) + + return [(X_train, y_train), (X_val, y_val), (X_test, y_test)] + + +def gen_sklearn_model(model_pkg_class: str, parameters: list) -> dict: + """Generate sklearn model configuration from class name and parameters. 
+ + :param model_pkg_class: Full class path (e.g., "sklearn.ensemble.RandomForestClassifier") + :param parameters: List of (key, value) parameter tuples + """ + config = { + "META": {"class": model_pkg_class}, + "CLASS": {}, + "FIT": {} + } + + # Parameters that should not be passed to sklearn model + excluded_params = { + 'model_pkg_class', 'dataset', 'label_column', 'encode_cols', + 'sample', 'test_size', 'train_val_split', 'test_set_key', + 'model_evaluator', 'models_dest', 'plots_dest', 'file_ext', + 'model_pkg_file', 'context' + } + + # Separate parameters into model init params and fit params + for key, value in parameters: + if key in ['X', 'y', 'sample_weight']: + config["FIT"][key] = value + elif key not in excluded_params: + # Only add parameters that are not function-specific + config["CLASS"][key] = value + + return config + + +def eval_model_v2( + context: MLClientCtx, + xvalid: pd.DataFrame, + yvalid: pd.Series, + model, + plots_artifact_path: str = None +) -> dict: + """Evaluate a sklearn classifier model. 
+ + :param context: MLRun context + :param xvalid: Validation features + :param yvalid: Validation labels + :param model: Trained sklearn model + :param plots_artifact_path: Path for plots (not used in this simplified version) + """ + y_pred = model.predict(xvalid) + + metrics = { + "accuracy": accuracy_score(yvalid, y_pred), + "precision": precision_score(yvalid, y_pred, average='weighted', zero_division=0), + "recall": recall_score(yvalid, y_pred, average='weighted', zero_division=0), + "f1_score": f1_score(yvalid, y_pred, average='weighted', zero_division=0) + } + + # Log metrics to context + for key, value in metrics.items(): + context.log_result(key, value) + + return {} + + def train_model( context: MLClientCtx, model_pkg_class: str, diff --git a/functions/src/sklearn_classifier/test_sklearn_classifier.py b/functions/src/sklearn_classifier/test_sklearn_classifier.py index 5c29e85b3..78afd623b 100644 --- a/functions/src/sklearn_classifier/test_sklearn_classifier.py +++ b/functions/src/sklearn_classifier/test_sklearn_classifier.py @@ -38,19 +38,29 @@ def test_import_sklearn_classifier(): params = {"model_pkg_class": "sklearn.ensemble.RandomForestClassifier", "label_column": "labels"} + # In local mode, artifacts are in function-name/iteration subdirectory + dataset_path = "./artifacts/gen-class-data-gen-class-data/0/classifier-data.csv" + assert os.path.exists(dataset_path), f"Dataset not found at {dataset_path}" + train_run = fn.run(params=params, - inputs={"dataset": acquire_run.status.artifacts[0]['spec']['target_path']}, + inputs={"dataset": dataset_path}, local=True, - artifact_path="./") + artifact_path="./artifacts") + + # Check that the run completed successfully + assert train_run.status.state == "completed", f"Run failed with state: {train_run.status.state}" + + # In local mode, check if model metrics were logged + assert "accuracy" in train_run.status.results or len(train_run.status.results) > 0, \ + "No metrics were logged" - for artifact in 
train_run.status.artifacts: - if artifact['kind'] == 'model': - assert os.path.exists(artifact['spec']['target_path']), 'Could not find model dir' - break + # In local mode, the model is saved to artifacts/model/function-name/iteration/model/model.pkl + model_path = "./artifacts/model/sklearn-classifier-train-model/0/model/model.pkl" + assert os.path.exists(model_path), f'Could not find model file at {model_path}' - assert os.path.exists(train_run.status.artifacts[0]['spec']['target_path']) - model = pickle.load(open(artifact['spec']['target_path'] + artifact['spec']['model_file'], 'rb')) - df = pd.read_csv(acquire_run.status.artifacts[0]['spec']['target_path']) + # Load the model and verify it can make predictions + model = pickle.load(open(model_path, 'rb')) + df = pd.read_csv(dataset_path) x = df.drop(['labels'], axis=1).iloc[0:1] y_true = df['labels'][0] y_pred = model.predict_proba(x).argmax() From 613d020f0136fb3b23dfb12e2fbd39b5822896e0 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Mon, 19 Jan 2026 16:50:09 +0200 Subject: [PATCH 02/15] remove filename --- functions/src/auto_trainer/function.yaml | 1 - functions/src/describe/function.yaml | 1 - functions/src/gen_class_data/function.yaml | 1 - functions/src/sklearn_classifier/function.yaml | 1 - 4 files changed, 4 deletions(-) diff --git a/functions/src/auto_trainer/function.yaml b/functions/src/auto_trainer/function.yaml index 3020b6521..e560064c3 100644 --- a/functions/src/auto_trainer/function.yaml +++ b/functions/src/auto_trainer/function.yaml @@ -8,7 +8,6 @@ spec: default_handler: train description: Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM. 
- filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/auto_trainer/auto_trainer.py command: '' disable_auto_mount: false entry_points: diff --git a/functions/src/describe/function.yaml b/functions/src/describe/function.yaml index 7116fae92..1c254c3c4 100644 --- a/functions/src/describe/function.yaml +++ b/functions/src/describe/function.yaml @@ -84,7 +84,6 @@ spec: - type: None name: analyze lineno: 46 - filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/describe/describe.py build: origin_filename: '' code_origin: '' diff --git a/functions/src/gen_class_data/function.yaml b/functions/src/gen_class_data/function.yaml index fde89341e..fa802964e 100644 --- a/functions/src/gen_class_data/function.yaml +++ b/functions/src/gen_class_data/function.yaml @@ -65,7 +65,6 @@ spec: origin_filename: '' functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZApmcm9tIHR5cGluZyBpbXBvcnQgT3B0aW9uYWwsIExpc3QKZnJvbSBza2xlYXJuLmRhdGFzZXRzIGltcG9ydCBtYWtlX2NsYXNzaWZpY2F0aW9uCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKCgpkZWYgZ2VuX2NsYXNzX2RhdGEoCiAgICAgICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICAgICAgbl9zYW1wbGVzOiBpbnQsCiAgICAgICAgbV9mZWF0dXJlczogaW50LAogICAgICAgIGtfY2xhc3NlczogaW50LAogICAgICAgIGhlYWR
lcjogT3B0aW9uYWxbTGlzdFtzdHJdXSwKICAgICAgICBsYWJlbF9jb2x1bW46IE9wdGlvbmFsW3N0cl0gPSAibGFiZWxzIiwKICAgICAgICB3ZWlnaHQ6IGZsb2F0ID0gMC41LAogICAgICAgIHJhbmRvbV9zdGF0ZTogaW50ID0gMSwKICAgICAgICBrZXk6IHN0ciA9ICJjbGFzc2lmaWVyLWRhdGEiLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgc2tfcGFyYW1zPXt9Cik6CiAgICAiIiJDcmVhdGUgYSBiaW5hcnkgY2xhc3NpZmljYXRpb24gc2FtcGxlIGRhdGFzZXQgYW5kIHNhdmUuCiAgICBJZiBubyBmaWxlbmFtZSBpcyBnaXZlbiBpdCB3aWxsIGRlZmF1bHQgdG86CiAgICAic2ltZGF0YS17bl9zYW1wbGVzfVh7bV9mZWF0dXJlc30ucGFycXVldCIuCgogICAgQWRkaXRpb25hbCBzY2lraXQtbGVhcm4gcGFyYW1ldGVycyBjYW4gYmUgc2V0IHVzaW5nICoqc2tfcGFyYW1zLCBwbGVhc2Ugc2VlIGh0dHBzOi8vc2Npa2l0LWxlYXJuLm9yZy9zdGFibGUvbW9kdWxlcy9nZW5lcmF0ZWQvc2tsZWFybi5kYXRhc2V0cy5tYWtlX2NsYXNzaWZpY2F0aW9uLmh0bWwgZm9yIG1vcmUgZGV0YWlscy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIG5fc2FtcGxlczogICAgIG51bWJlciBvZiByb3dzL3NhbXBsZXMKICAgIDpwYXJhbSBtX2ZlYXR1cmVzOiAgICBudW1iZXIgb2YgY29scy9mZWF0dXJlcwogICAgOnBhcmFtIGtfY2xhc3NlczogICAgIG51bWJlciBvZiBjbGFzc2VzCiAgICA6cGFyYW0gaGVhZGVyOiAgICAgICAgaGVhZGVyIGZvciBmZWF0dXJlcyBhcnJheQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogIGNvbHVtbiBuYW1lIG9mIGdyb3VuZC10cnV0aCBzZXJpZXMKICAgIDpwYXJhbSB3ZWlnaHQ6ICAgICAgICBmcmFjdGlvbiBvZiBzYW1wbGUgbmVnYXRpdmUgdmFsdWUgKGdyb3VuZC10cnV0aD0wKQogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogIHJuZyBzZWVkIChzZWUgaHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS9nbG9zc2FyeS5odG1sI3Rlcm0tcmFuZG9tLXN0YXRlKQogICAgOnBhcmFtIGtleTogICAgICAgICAgIGtleSBvZiBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgKHBxdCkgZXh0ZW5zaW9uIGZvciBwYXJxdWV0IGZpbGUKICAgIDpwYXJhbSBza19wYXJhbXM6ICAgICBhZGRpdGlvbmFsIHBhcmFtZXRlcnMgZm9yIGBza2xlYXJuLmRhdGFzZXRzLm1ha2VfY2xhc3NpZmljYXRpb25gCiAgICAiIiIKICAgIGZlYXR1cmVzLCBsYWJlbHMgPSBtYWtlX2NsYXNzaWZpY2F0aW9uKAogICAgICAgIG5fc2FtcGxlcz1uX3NhbXBsZXMsCiAgICAgICAgbl9mZWF0dXJlcz1tX2ZlYXR1cmVzLAogICAgICAgIHdlaWdodHM9d2VpZ2h0LAogICAgICAgIG5fY2xhc3Nlcz1rX2NsYXNzZXMsCiAgICAgICAgcmFuZG9tX3N0YXRlPXJhbmRvbV9zdGF0ZSwKICAgICAgICAqKnNrX3BhcmFtcykKCiAgICAjIG1ha2U
gZGF0YWZyYW1lcywgYWRkIGNvbHVtbiBuYW1lcywgY29uY2F0ZW5hdGUgKFgsIHkpCiAgICBYID0gcGQuRGF0YUZyYW1lKGZlYXR1cmVzKQogICAgaWYgbm90IGhlYWRlcjoKICAgICAgICBYLmNvbHVtbnMgPSBbImZlYXRfIiArIHN0cih4KSBmb3IgeCBpbiByYW5nZShtX2ZlYXR1cmVzKV0KICAgIGVsc2U6CiAgICAgICAgWC5jb2x1bW5zID0gaGVhZGVyCgogICAgeSA9IHBkLkRhdGFGcmFtZShsYWJlbHMsIGNvbHVtbnM9W2xhYmVsX2NvbHVtbl0pCiAgICBkYXRhID0gcGQuY29uY2F0KFtYLCB5XSwgYXhpcz0xKQoKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoa2V5LCBkZj1kYXRhLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PUZhbHNlKQo= code_origin: '' - filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/gen_class_data/gen_class_data.py command: '' image: mlrun/mlrun default_handler: gen_class_data diff --git a/functions/src/sklearn_classifier/function.yaml b/functions/src/sklearn_classifier/function.yaml index 603922c95..208497ecc 100644 --- a/functions/src/sklearn_classifier/function.yaml +++ b/functions/src/sklearn_classifier/function.yaml @@ -171,7 +171,6 @@ spec: origin_filename: '' code_origin: '' disable_auto_mount: false - filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/sklearn_classifier/sklearn_classifier.py description: train any classifier using scikit-learn's API metadata: name: sklearn-classifier From 6256dcf6d6794c74e4ac04b288678c232279de98 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Mon, 19 Jan 2026 17:37:10 +0200 Subject: [PATCH 03/15] remove numpy import --- functions/src/sklearn_classifier/sklearn_classifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/functions/src/sklearn_classifier/sklearn_classifier.py b/functions/src/sklearn_classifier/sklearn_classifier.py index 724f78356..2fc60a102 100644 --- a/functions/src/sklearn_classifier/sklearn_classifier.py +++ b/functions/src/sklearn_classifier/sklearn_classifier.py @@ -21,7 +21,6 @@ from cloudpickle import dumps import pandas as pd -import numpy as np from typing import List, Tuple from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, precision_score, 
recall_score, f1_score From 227cc95c7e61b29c0c5a505f794cc14bb63efdbc Mon Sep 17 00:00:00 2001 From: tomerbv Date: Tue, 20 Jan 2026 13:57:27 +0200 Subject: [PATCH 04/15] revert sklearn.metrics monkey patch fix _get_dataframe to handle list/dict before accessing artifact_url added feature name preservation logic in predict function --- functions/src/auto_trainer/auto_trainer.py | 81 ++++++++++++------- functions/src/auto_trainer/function.yaml | 71 ++++++++-------- .../src/auto_trainer/test_auto_trainer.py | 31 ------- 3 files changed, 86 insertions(+), 97 deletions(-) diff --git a/functions/src/auto_trainer/auto_trainer.py b/functions/src/auto_trainer/auto_trainer.py index 7b4764700..ab2c6ee88 100755 --- a/functions/src/auto_trainer/auto_trainer.py +++ b/functions/src/auto_trainer/auto_trainer.py @@ -67,30 +67,14 @@ def _get_dataframe( Classification tasks. :param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop. """ - store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url) - - # Getting the dataset: - if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: - label_columns = label_columns or dataset.meta.status.label_column - context.logger.info(f"label columns: {label_columns}") - # FeatureVector case: - try: - fv = mlrun.datastore.get_store_resource(dataset.artifact_url) - dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe() - except AttributeError: - # Leave here for backwards compatibility - dataset = fs.get_offline_features( - dataset.meta.uri, drop_columns=drop_columns - ).to_dataframe() - - elif not label_columns: - context.logger.info( - "label_columns not provided, mandatory when dataset is not a FeatureVector" - ) - raise ValueError - - elif isinstance(dataset, (list, dict)): + # Check if dataset is list/dict first (before trying to access artifact_url) + if isinstance(dataset, (list, dict)): # list/dict case: + if not label_columns: + context.logger.info( + 
"label_columns not provided, mandatory when dataset is not a FeatureVector" + ) + raise ValueError dataset = pd.DataFrame(dataset) # Checking if drop_columns provided by integer type: if drop_columns: @@ -103,17 +87,38 @@ def _get_dataframe( ) raise ValueError dataset.drop(drop_columns, axis=1, inplace=True) - else: - # simple URL case: - dataset = dataset.as_df() - if drop_columns: - if all(col in dataset for col in drop_columns): - dataset = dataset.drop(drop_columns, axis=1) - else: + # Dataset is a DataItem with artifact_url (URI or FeatureVector) + store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url) + + # Getting the dataset: + if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: + label_columns = label_columns or dataset.meta.status.label_column + context.logger.info(f"label columns: {label_columns}") + # FeatureVector case: + try: + fv = mlrun.datastore.get_store_resource(dataset.artifact_url) + dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe() + except AttributeError: + # Leave here for backwards compatibility + dataset = fs.get_offline_features( + dataset.meta.uri, drop_columns=drop_columns + ).to_dataframe() + else: + # simple URL case: + if not label_columns: context.logger.info( - "not all of the columns to drop in the dataset, drop columns process skipped" + "label_columns not provided, mandatory when dataset is not a FeatureVector" ) + raise ValueError + dataset = dataset.as_df() + if drop_columns: + if all(col in dataset for col in drop_columns): + dataset = dataset.drop(drop_columns, axis=1) + else: + context.logger.info( + "not all of the columns to drop in the dataset, drop columns process skipped" + ) return dataset, label_columns @@ -361,6 +366,20 @@ def predict( # loading the model, and getting the model handler: model_handler = AutoMLRun.load_model(model_path=model, context=context) + # Fix feature names for models that require them (e.g., XGBoost) + # When dataset comes from a list, 
pandas assigns default integer column names + # but some models expect specific feature names they were trained with + if hasattr(model_handler.model, 'feature_names_in_'): + expected_features = model_handler.model.feature_names_in_ + if len(dataset.columns) == len(expected_features): + # Only rename if the number of columns matches + # This handles the case where a list was converted to DataFrame with default column names + if not all(col == feat for col, feat in zip(dataset.columns, expected_features)): + context.logger.info( + f"Renaming dataset columns to match model's expected feature names" + ) + dataset.columns = expected_features + # Dropping label columns if necessary: if not label_columns: label_columns = [] diff --git a/functions/src/auto_trainer/function.yaml b/functions/src/auto_trainer/function.yaml index e560064c3..32e1f67dc 100644 --- a/functions/src/auto_trainer/function.yaml +++ b/functions/src/auto_trainer/function.yaml @@ -1,30 +1,15 @@ -kind: job +metadata: + name: auto-trainer + categories: + - machine-learning + - model-training + tag: '' spec: - build: - functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import pandas as pd
from mlrun import feature_store as fs
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.frameworks.auto_mlrun import AutoMLRun
from mlrun.utils.helpers import create_class, create_function
from sklearn.model_selection import train_test_split

PathType = Union[str, Path]


class KWArgsPrefixes:
    """
    Prefixes of the keyword arguments, marking the target each keyword argument should be passed to.
    """

    # Keyword arguments for the model class:
    MODEL_CLASS = "CLASS_"
    # Keyword arguments for the `fit` function:
    FIT = "FIT_"
    # Keyword arguments for the `train` function:
    TRAIN = "TRAIN_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Getting the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
                            Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.
                            Must be given as integer(s) when the dataset is a list/dict.

    :returns: A tuple of the dataset as a DataFrame and the label columns (taken from the FeatureVector when they
              were not provided).

    :raises ValueError: If `label_columns` is missing for a dataset that is not a FeatureVector, or if string
                        `drop_columns` were given along with a list/dict dataset.
    """
    missing_labels_message = (
        "label_columns not provided, mandatory when dataset is not a FeatureVector"
    )

    # A raw sample (list/dict) has no `artifact_url`, so it is handled before any `DataItem` attribute is read:
    if isinstance(dataset, (list, dict)):
        if not label_columns:
            context.logger.info(missing_labels_message)
            raise ValueError(missing_labels_message)
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                invalid_drop_message = (
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
                )
                context.logger.error(invalid_drop_message)
                raise ValueError(invalid_drop_message)
            dataset.drop(drop_columns, axis=1, inplace=True)
        return dataset, label_columns

    # From here on the dataset is expected to expose `artifact_url` (URI or FeatureVector):
    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)

    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
        # FeatureVector case:
        label_columns = label_columns or dataset.meta.status.label_column
        context.logger.info(f"label columns: {label_columns}")
        try:
            fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
            dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
        except AttributeError:
            # Leave here for backwards compatibility
            dataset = fs.get_offline_features(
                dataset.meta.uri, drop_columns=drop_columns
            ).to_dataframe()
        return dataset, label_columns

    # Simple URL case:
    if not label_columns:
        context.logger.info(missing_labels_message)
        raise ValueError(missing_labels_message)
    dataset = dataset.as_df()
    if drop_columns:
        # A single name/index is wrapped in a list, otherwise a string would be checked character by character
        # and the requested column would never be dropped:
        columns_to_drop = (
            [drop_columns] if isinstance(drop_columns, (str, int)) else drop_columns
        )
        if all(col in dataset for col in columns_to_drop):
            dataset = dataset.drop(columns_to_drop, axis=1)
        else:
            context.logger.info(
                "not all of the columns to drop in the dataset, drop columns process skipped"
            )

    return dataset, label_columns


def train(
    context: MLClientCtx,
    dataset: DataItem,
    model_class: str,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: List[str] = None,
    model_name: str = "model",
    tag: str = "",
    sample_set: DataItem = None,
    test_set: DataItem = None,
    train_test_split_size: float = None,
    random_state: int = None,
    labels: dict = None,
    **kwargs,
):
    """
    Training a model with the given dataset.

    example::

        import mlrun
        project = mlrun.get_or_create_project("my-project")
        project.set_function("hub://auto_trainer", "train")
        trainer_run = project.run(
            name="train",
            handler="train",
            inputs={"dataset": "./path/to/dataset.csv"},
            params={
                "model_class": "sklearn.linear_model.LogisticRegression",
                "label_columns": "label",
                "drop_columns": "id",
                "model_name": "my-model",
                "tag": "v1.0.0",
                "sample_set": "./path/to/sample_set.csv",
                "test_set": "./path/to/test_set.csv",
                "CLASS_solver": "liblinear",
            },
        )

    :param context:                 MLRun context
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param model_class:             The class of the model, e.g. `sklearn.linear_model.LogisticRegression`
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop
    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
    :param tag:                     The model's tag to log with
    :param sample_set:              A sample set of inputs for the model for logging its stats along the model in favour
                                    of model monitoring. Can be either a URI or a FeatureVector
    :param test_set:                The test set to test the model with.
    :param train_test_split_size:   if test_set was provided then this argument is ignored.
                                    Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split. The size of the Training set is set to the complement of this
                                    value. Default = 0.2
    :param random_state:            Relevant only when using train_test_split_size.
                                    A random state seed to shuffle the data. For more information, see:
                                    https://scikit-learn.org/stable/glossary.html#term-random_state
                                    Notice that here we only pass integer values.
    :param labels:                  Labels to log with the model
    :param kwargs:                  Here you can pass keyword arguments with prefixes,
                                    that will be parsed and passed to the relevant function, by the following prefixes:
                                    - `CLASS_` - for the model class arguments
                                    - `FIT_` - for the `fit` function arguments
                                    - `TRAIN_` - for the `train` function (in xgb or lgbm train function - future)

    :raises NotImplementedError: If the given `model_class` has a `train` attribute (training by function is not
                                 supported yet).
    """
    # Validate inputs:
    # Check if exactly one of them is supplied:
    if test_set is None:
        if train_test_split_size is None:
            context.logger.info(
                "test_set or train_test_split_size are not provided, setting train_test_split_size to 0.2"
            )
            train_test_split_size = 0.2

    elif train_test_split_size:
        context.logger.info(
            "test_set provided, ignoring given train_test_split_size value"
        )
        train_test_split_size = None

    # Get DataFrame by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Getting the sample set:
    if sample_set is None:
        context.logger.info(
            "Sample set not given, using the whole training set as the sample set"
        )
        sample_set = dataset
    else:
        sample_set, _ = _get_dataframe(
            context=context,
            dataset=sample_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

    # Parsing kwargs:
    # TODO: Use in xgb or lgbm train function.
    train_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.TRAIN)
    fit_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.FIT)
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=kwargs, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Check if model or function:
    # NOTE(review): `model_class` is annotated as a str, so this branch looks unreachable for string inputs - confirm.
    if hasattr(model_class, "train"):
        # TODO: Need to call: model(), afterwards to start the train function.
        # model = create_function(f"{model_class}.train")
        raise NotImplementedError
    else:
        # Creating model instance:
        model = create_class(model_class)(**model_class_kwargs)

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]
    if train_test_split_size:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=train_test_split_size, random_state=random_state
        )
    else:
        x_train, y_train = x, y

        test_set = test_set.as_df()
        if drop_columns:
            # Drop from the test set itself. Dropping from `dataset` here would replace the test set with the
            # training data (whose `drop_columns` were already removed).
            test_set = test_set.drop(drop_columns, axis=1)

        x_test, y_test = test_set.drop(label_columns, axis=1), test_set[label_columns]

    # The MLRun wrapping is applied before `fit` is called on the model:
    AutoMLRun.apply_mlrun(
        model=model,
        model_name=model_name,
        context=context,
        tag=tag,
        sample_set=sample_set,
        y_columns=label_columns,
        test_set=test_set,
        x_test=x_test,
        y_test=y_test,
        artifacts=context.artifacts,
        labels=labels,
    )
    context.logger.info(f"training '{model_name}'")
    model.fit(x_train, y_train, **fit_kwargs)


def evaluate(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: List[str] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    **kwargs,
):
    """
    Evaluating a model. Artifacts generated by the MLHandler.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to evaluate the model on. Can be either a URI or a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Parsing label_columns:
    if label_columns:
        if not isinstance(label_columns, list):
            label_columns = [label_columns]
        parsed_label_columns = []
        for lc in label_columns:
            if fs.common.feature_separator in lc:
                # Feature-string label: the column is named by its alias when given, else by the label name.
                _, label_name, alias = fs.common.parse_feature_string(lc)
                parsed_label_columns.append(alias or label_name)
            else:
                # Plain column names are kept as they are, so a mix of both forms does not lose any label.
                parsed_label_columns.append(lc)
        label_columns = parsed_label_columns

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]

    # Loading the model and predicting:
    # NOTE(review): the model name passed here is hard-coded - confirm it is intended for every model type.
    model_handler = AutoMLRun.load_model(
        model_path=model, context=context, model_name="model_LinearRegression"
    )
    AutoMLRun.apply_mlrun(model_handler.model, y_test=y, model_path=model)

    context.logger.info(f"evaluating '{model_handler.model_name}'")
    model_handler.model.predict(x, **kwargs)


def predict(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: Union[str, List[str], int, List[int]] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    result_set: Optional[str] = None,
    **kwargs,
):
    """
    Predicting dataset by a model.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to predict the model on. Can be either a URI, a FeatureVector or a
                                    sample in a shape of a list/dict.
                                    When passing a sample, pass the dataset as a field in `params` instead of `inputs`.
    :param drop_columns:            str/int or a list of strings/ints that represent the column names/indices to drop.
                                    When the dataset is a list/dict this parameter should be represented by integers.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param result_set:              The db key to set name of the prediction result and the filename.
                                    Default to 'prediction'.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).

    :raises ValueError: If the model predicts fewer outputs than the number of label columns, or if a label column
                        name already exists in the dataset.
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # loading the model, and getting the model handler:
    model_handler = AutoMLRun.load_model(model_path=model, context=context)

    # Normalizing the label columns into a list:
    if not label_columns:
        label_columns = []
    elif isinstance(label_columns, str):
        label_columns = [label_columns]
    else:
        # Copy, so extending it with generated names below never mutates the caller's list:
        label_columns = list(label_columns)

    # Predicting:
    context.logger.info(f"making prediction by '{model_handler.model_name}'")
    y_pred = model_handler.model.predict(dataset, **kwargs)

    # Preparing and validating label columns for the dataframe of the prediction result:
    num_predicted = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]

    if num_predicted > len(label_columns):
        if num_predicted == 1:
            label_columns = ["predicted labels"]
        else:
            # Generate names for the predicted outputs that have no matching label column:
            label_columns.extend(
                [
                    f"predicted_label_{i + 1 + len(label_columns)}"
                    for i in range(num_predicted - len(label_columns))
                ]
            )
    elif num_predicted < len(label_columns):
        context.logger.error(
            f"number of predicted labels: {num_predicted} is smaller than number of label columns: {len(label_columns)}"
        )
        raise ValueError

    artifact_name = result_set or "prediction"
    labels_inside_df = set(label_columns) & set(dataset.columns.tolist())
    if labels_inside_df:
        context.logger.error(
            f"The labels: {labels_inside_df} are already existed in the dataframe"
        )
        raise ValueError
    # The predictions reuse the dataset's index, so the column-wise concat pairs every row with its own prediction
    # even when the dataset does not have a default 0..n-1 index:
    predictions = pd.DataFrame(y_pred, columns=label_columns, index=dataset.index)
    pred_df = pd.concat([dataset, predictions], axis=1)
    context.log_dataset(artifact_name, pred_df, db_key=result_set)
 - code_origin: '' - origin_filename: '' - image: mlrun/mlrun - default_handler: train - description: Automatic train, evaluate and predict functions for the ML frameworks - - Scikit-Learn, XGBoost and LightGBM. - command: '' - disable_auto_mount: false + filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/auto_trainer/auto_trainer.py entry_points: train: - lineno: 121 - has_varargs: false name: train - doc: "Training a model with the given dataset.\n\nexample::\n\n import mlrun\n\ - \ project = mlrun.get_or_create_project(\"my-project\")\n project.set_function(\"\ - hub://auto_trainer\", \"train\")\n trainer_run = project.run(\n \ - \ name=\"train\",\n handler=\"train\",\n inputs={\"dataset\"\ - : \"./path/to/dataset.csv\"},\n params={\n \"model_class\"\ - : \"sklearn.linear_model.LogisticRegression\",\n \"label_columns\"\ - : \"label\",\n \"drop_columns\": \"id\",\n \"model_name\"\ - : \"my-model\",\n \"tag\": \"v1.0.0\",\n \"sample_set\"\ - : \"./path/to/sample_set.csv\",\n \"test_set\": \"./path/to/test_set.csv\"\ - ,\n \"CLASS_solver\": \"liblinear\",\n },\n )" + lineno: 126 parameters: - name: context type: MLClientCtx @@ -79,11 +64,20 @@ spec: doc: Labels to log with the model default: null has_kwargs: true - evaluate: - lineno: 273 + doc: "Training a model with the given dataset.\n\nexample::\n\n import mlrun\n\ + \ project = mlrun.get_or_create_project(\"my-project\")\n project.set_function(\"\ + hub://auto_trainer\", \"train\")\n trainer_run = project.run(\n \ + \ name=\"train\",\n handler=\"train\",\n inputs={\"dataset\"\ + : \"./path/to/dataset.csv\"},\n params={\n \"model_class\"\ + : \"sklearn.linear_model.LogisticRegression\",\n \"label_columns\"\ + : \"label\",\n \"drop_columns\": \"id\",\n \"model_name\"\ + : \"my-model\",\n \"tag\": \"v1.0.0\",\n \"sample_set\"\ + : \"./path/to/sample_set.csv\",\n \"test_set\": \"./path/to/test_set.csv\"\ + ,\n \"CLASS_solver\": \"liblinear\",\n },\n )" has_varargs: false + evaluate: name: 
evaluate - doc: Evaluating a model. Artifacts generated by the MLHandler. + lineno: 278 parameters: - name: context type: MLClientCtx @@ -104,11 +98,11 @@ spec: Classification tasks. Mandatory when dataset is not a FeatureVector. default: null has_kwargs: true - predict: - lineno: 327 + doc: Evaluating a model. Artifacts generated by the MLHandler. has_varargs: false + predict: name: predict - doc: Predicting dataset by a model. + lineno: 332 parameters: - name: context type: MLClientCtx @@ -138,10 +132,17 @@ spec: to 'prediction'. default: null has_kwargs: true + doc: Predicting dataset by a model. + has_varargs: false + description: Automatic train, evaluate and predict functions for the ML frameworks + - Scikit-Learn, XGBoost and LightGBM. + command: '' + disable_auto_mount: false + image: mlrun/mlrun + default_handler: train + build: + code_origin: '' + origin_filename: '' + functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import pandas as pd
from mlrun import feature_store as fs
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.frameworks.auto_mlrun import AutoMLRun
from mlrun.utils.helpers import create_class, create_function
from sklearn.model_selection import train_test_split

PathType = Union[str, Path]


class KWArgsPrefixes:
    """
    Prefixes of the keyword arguments, marking the target each keyword argument should be passed to.
    """

    # Keyword arguments for the model class:
    MODEL_CLASS = "CLASS_"
    # Keyword arguments for the `fit` function:
    FIT = "FIT_"
    # Keyword arguments for the `train` function:
    TRAIN = "TRAIN_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Getting the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
                            Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.
                            Must be given as integer(s) when the dataset is a list/dict.

    :returns: A tuple of the dataset as a DataFrame and the label columns (taken from the FeatureVector when they
              were not provided).

    :raises ValueError: If `label_columns` is missing for a dataset that is not a FeatureVector, or if string
                        `drop_columns` were given along with a list/dict dataset.
    """
    missing_labels_message = (
        "label_columns not provided, mandatory when dataset is not a FeatureVector"
    )

    # Check if dataset is list/dict first (before trying to access artifact_url)
    if isinstance(dataset, (list, dict)):
        # list/dict case:
        if not label_columns:
            context.logger.info(missing_labels_message)
            raise ValueError(missing_labels_message)
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                invalid_drop_message = (
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
                )
                context.logger.error(invalid_drop_message)
                raise ValueError(invalid_drop_message)
            dataset.drop(drop_columns, axis=1, inplace=True)
        return dataset, label_columns

    # Dataset is a DataItem with artifact_url (URI or FeatureVector)
    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)

    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
        # FeatureVector case:
        label_columns = label_columns or dataset.meta.status.label_column
        context.logger.info(f"label columns: {label_columns}")
        try:
            fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
            dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
        except AttributeError:
            # Leave here for backwards compatibility
            dataset = fs.get_offline_features(
                dataset.meta.uri, drop_columns=drop_columns
            ).to_dataframe()
        return dataset, label_columns

    # simple URL case:
    if not label_columns:
        context.logger.info(missing_labels_message)
        raise ValueError(missing_labels_message)
    dataset = dataset.as_df()
    if drop_columns:
        # A single name/index is wrapped in a list, otherwise a string would be checked character by character
        # and the requested column would never be dropped:
        columns_to_drop = (
            [drop_columns] if isinstance(drop_columns, (str, int)) else drop_columns
        )
        if all(col in dataset for col in columns_to_drop):
            dataset = dataset.drop(columns_to_drop, axis=1)
        else:
            context.logger.info(
                "not all of the columns to drop in the dataset, drop columns process skipped"
            )

    return dataset, label_columns


def train(
    context: MLClientCtx,
    dataset: DataItem,
    model_class: str,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: List[str] = None,
    model_name: str = "model",
    tag: str = "",
    sample_set: DataItem = None,
    test_set: DataItem = None,
    train_test_split_size: float = None,
    random_state: int = None,
    labels: dict = None,
    **kwargs,
):
    """
    Training a model with the given dataset.

    example::

        import mlrun
        project = mlrun.get_or_create_project("my-project")
        project.set_function("hub://auto_trainer", "train")
        trainer_run = project.run(
            name="train",
            handler="train",
            inputs={"dataset": "./path/to/dataset.csv"},
            params={
                "model_class": "sklearn.linear_model.LogisticRegression",
                "label_columns": "label",
                "drop_columns": "id",
                "model_name": "my-model",
                "tag": "v1.0.0",
                "sample_set": "./path/to/sample_set.csv",
                "test_set": "./path/to/test_set.csv",
                "CLASS_solver": "liblinear",
            },
        )

    :param context:                 MLRun context
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param model_class:             The class of the model, e.g. `sklearn.linear_model.LogisticRegression`
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop
    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
    :param tag:                     The model's tag to log with
    :param sample_set:              A sample set of inputs for the model for logging its stats along the model in favour
                                    of model monitoring. Can be either a URI or a FeatureVector
    :param test_set:                The test set to evaluate the trained model on. When given, the whole dataset is
                                    used for training and `drop_columns` is dropped from the test set as well.
    :param train_test_split_size:   if test_set was provided then this argument is ignored.
                                    Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split. The size of the Training set is set to the complement of this
                                    value. Default = 0.2
    :param random_state:            Relevant only when using train_test_split_size.
                                    A random state seed to shuffle the data. For more information, see:
                                    https://scikit-learn.org/stable/glossary.html#term-random_state
                                    Notice that here we only pass integer values.
    :param labels:                  Labels to log with the model
    :param kwargs:                  Here you can pass keyword arguments with prefixes,
                                    that will be parsed and passed to the relevant function, by the following prefixes:
                                    - `CLASS_` - for the model class arguments
                                    - `FIT_` - for the `fit` function arguments
                                    - `TRAIN_` - for the `train` function (in xgb or lgbm train function - future)

    :raises NotImplementedError:    If `model_class` exposes a `train` attribute (function-style training is not
                                    supported yet).
    """
    # Validate inputs:
    # Exactly one of `test_set` / `train_test_split_size` ends up being used:
    if test_set is None:
        if train_test_split_size is None:
            context.logger.info(
                "test_set or train_test_split_size are not provided, setting train_test_split_size to 0.2"
            )
            train_test_split_size = 0.2

    elif train_test_split_size:
        context.logger.info(
            "test_set provided, ignoring given train_test_split_size value"
        )
        train_test_split_size = None

    # Get DataFrame by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Getting the sample set:
    if sample_set is None:
        context.logger.info(
            "Sample set not given, using the whole training set as the sample set"
        )
        sample_set = dataset
    else:
        sample_set, _ = _get_dataframe(
            context=context,
            dataset=sample_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

    # Parsing kwargs:
    # TODO: Use in xgb or lgbm train function.
    train_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.TRAIN)
    fit_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.FIT)
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=kwargs, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Check if model or function:
    if hasattr(model_class, "train"):
        # TODO: Need to call: model(), afterwards to start the train function.
        # model = create_function(f"{model_class}.train")
        raise NotImplementedError
    else:
        # Creating model instance:
        model = create_class(model_class)(**model_class_kwargs)

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]
    if train_test_split_size:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=train_test_split_size, random_state=random_state
        )
    else:
        # An explicit test set was given, so the whole dataset is used for training:
        x_train, y_train = x, y

        test_set = test_set.as_df()
        if drop_columns:
            # Drop the columns from the test set itself. Dropping from `dataset` here would replace the
            # test set with the training data and the model would be evaluated on what it was trained on.
            test_set = test_set.drop(drop_columns, axis=1)

        x_test, y_test = test_set.drop(label_columns, axis=1), test_set[label_columns]

    # Apply MLRun on the model before fitting, so the fit call below is tracked and logged:
    AutoMLRun.apply_mlrun(
        model=model,
        model_name=model_name,
        context=context,
        tag=tag,
        sample_set=sample_set,
        y_columns=label_columns,
        test_set=test_set,
        x_test=x_test,
        y_test=y_test,
        artifacts=context.artifacts,
        labels=labels,
    )
    context.logger.info(f"training '{model_name}'")
    model.fit(x_train, y_train, **fit_kwargs)


def evaluate(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: List[str] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    **kwargs,
):
    """
    Evaluating a model. Artifacts generated by the MLHandler.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to evaluate the model on. Can be either a URI or a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
                                    Labels given as feature strings (containing the feature separator) are resolved
                                    to their alias, or to the feature name when there is no alias.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Parsing label_columns:
    if label_columns:
        if not isinstance(label_columns, list):
            label_columns = [label_columns]
        parsed_label_columns = []
        for lc in label_columns:
            if fs.common.feature_separator in lc:
                # Feature-string label: the dataframe column is named by the alias or the feature name.
                _, label_name, alias = fs.common.parse_feature_string(lc)
                parsed_label_columns.append(alias or label_name)
            else:
                # Plain column name: keep it, so mixing plain and feature-string labels loses nothing.
                parsed_label_columns.append(lc)
        label_columns = parsed_label_columns

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]

    # Loading the model and predicting:
    # NOTE(review): the model name is hard-coded here, while `predict` loads the model without one -
    # confirm this is intended.
    model_handler = AutoMLRun.load_model(
        model_path=model, context=context, model_name="model_LinearRegression"
    )
    AutoMLRun.apply_mlrun(model_handler.model, y_test=y, model_path=model)

    context.logger.info(f"evaluating '{model_handler.model_name}'")
    model_handler.model.predict(x, **kwargs)


def predict(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: Union[str, List[str], int, List[int]] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    result_set: Optional[str] = None,
    **kwargs,
):
    """
    Predicting dataset by a model.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to predict the model on. Can be either a URI, a FeatureVector or a
                                    sample in a shape of a list/dict.
                                    When passing a sample, pass the dataset as a field in `params` instead of `inputs`.
    :param drop_columns:            str/int or a list of strings/ints that represent the column names/indices to drop.
                                    When the dataset is a list/dict this parameter should be represented by integers.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param result_set:              The db key to set name of the prediction result and the filename.
                                    Default to 'prediction'.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).

    :raises ValueError:             If there are fewer predicted outputs than label columns, or if a label column
                                    name already exists in the dataset.
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # loading the model, and getting the model handler:
    model_handler = AutoMLRun.load_model(model_path=model, context=context)

    # Fix feature names for models that require them (e.g., XGBoost)
    # When dataset comes from a list, pandas assigns default integer column names
    # but some models expect specific feature names they were trained with
    if hasattr(model_handler.model, 'feature_names_in_'):
        expected_features = model_handler.model.feature_names_in_
        if len(dataset.columns) == len(expected_features):
            # Only rename if the number of columns matches
            # This handles the case where a list was converted to DataFrame with default column names
            if not all(col == feat for col, feat in zip(dataset.columns, expected_features)):
                context.logger.info(
                    "Renaming dataset columns to match model's expected feature names"
                )
                dataset.columns = expected_features

    # Normalizing label columns into a fresh list, so the caller's list is never mutated below:
    if not label_columns:
        label_columns = []
    elif isinstance(label_columns, str):
        label_columns = [label_columns]
    else:
        label_columns = list(label_columns)

    # Predicting:
    context.logger.info(f"making prediction by '{model_handler.model_name}'")
    y_pred = model_handler.model.predict(dataset, **kwargs)

    # Preparing and validating label columns for the dataframe of the prediction result:
    num_predicted = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]

    if num_predicted > len(label_columns):
        if num_predicted == 1:
            label_columns = ["predicted labels"]
        else:
            # Generate names for the outputs that were not named by the user:
            label_columns.extend(
                [
                    f"predicted_label_{i + 1 + len(label_columns)}"
                    for i in range(num_predicted - len(label_columns))
                ]
            )
    elif num_predicted < len(label_columns):
        message = (
            f"number of predicted labels: {num_predicted} is smaller than number of label columns: {len(label_columns)}"
        )
        context.logger.error(message)
        raise ValueError(message)

    artifact_name = result_set or "prediction"
    labels_inside_df = set(label_columns) & set(dataset.columns.tolist())
    if labels_inside_df:
        message = f"The labels: {labels_inside_df} are already existed in the dataframe"
        context.logger.error(message)
        raise ValueError(message)

    predictions = pd.DataFrame(y_pred, columns=label_columns)
    # `pd.concat(axis=1)` aligns rows by index. Predictions are positional, so give them the dataset's
    # index - otherwise a dataset without a default RangeIndex yields misaligned rows padded with NaN.
    predictions.index = dataset.index
    pred_df = pd.concat([dataset, predictions], axis=1)
    context.log_dataset(artifact_name, pred_df, db_key=result_set)
 +kind: job verbose: false -metadata: - name: auto-trainer - categories: - - machine-learning - - model-training - tag: '' diff --git a/functions/src/auto_trainer/test_auto_trainer.py b/functions/src/auto_trainer/test_auto_trainer.py index 4a517f112..06b553a35 100644 --- a/functions/src/auto_trainer/test_auto_trainer.py +++ b/functions/src/auto_trainer/test_auto_trainer.py @@ -25,37 +25,6 @@ make_regression, ) -# Monkey-patch sklearn metrics to fix MLRun compatibility with sklearn 1.5+ -# MLRun 1.10.0 calls metrics with the deprecated 'squared' parameter -import sklearn.metrics -from sklearn.metrics import ( - mean_squared_error as _original_mse, - mean_absolute_error as _original_mae, - median_absolute_error as _original_medae, -) - - -def _patched_mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average', squared=None): - """Wrapper for mean_squared_error that ignores the deprecated 'squared' parameter.""" - # In sklearn 1.4+, 'squared' parameter was removed. 
Always return MSE (not RMSE) - return _original_mse(y_true, y_pred, sample_weight=sample_weight, multioutput=multioutput) - - -def _patched_mean_absolute_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average', squared=None): - """Wrapper for mean_absolute_error that ignores any 'squared' parameter.""" - return _original_mae(y_true, y_pred, sample_weight=sample_weight, multioutput=multioutput) - - -def _patched_median_absolute_error(y_true, y_pred, multioutput='uniform_average', sample_weight=None, squared=None): - """Wrapper for median_absolute_error that ignores any 'squared' parameter.""" - return _original_medae(y_true, y_pred, multioutput=multioutput, sample_weight=sample_weight) - - -# Apply the patches -sklearn.metrics.mean_squared_error = _patched_mean_squared_error -sklearn.metrics.mean_absolute_error = _patched_mean_absolute_error -sklearn.metrics.median_absolute_error = _patched_median_absolute_error - MODELS = [ ("sklearn.linear_model.LinearRegression", "regression"), ("sklearn.ensemble.RandomForestClassifier", "classification"), From 1c2323c82edd81e619c78e380cafeeeeadbe1602 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Tue, 20 Jan 2026 14:20:24 +0200 Subject: [PATCH 05/15] revert mlrun version --- functions/src/auto_trainer/item.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functions/src/auto_trainer/item.yaml b/functions/src/auto_trainer/item.yaml index d397a79d6..78de92ca0 100755 --- a/functions/src/auto_trainer/item.yaml +++ b/functions/src/auto_trainer/item.yaml @@ -13,7 +13,7 @@ labels: author: Iguazio maintainers: [] marketplaceType: '' -mlrunVersion: 1.10.0 +mlrunVersion: 1.7.0 name: auto_trainer platformVersion: 3.5.0 spec: From ef99df27936537f0f2dd3115072e128ba08a87d7 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Tue, 20 Jan 2026 14:35:00 +0200 Subject: [PATCH 06/15] revert get_or_create_project --- functions/src/auto_trainer/test_auto_trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) 
diff --git a/functions/src/auto_trainer/test_auto_trainer.py b/functions/src/auto_trainer/test_auto_trainer.py index 06b553a35..9a1ff554c 100644 --- a/functions/src/auto_trainer/test_auto_trainer.py +++ b/functions/src/auto_trainer/test_auto_trainer.py @@ -82,7 +82,7 @@ def test_train(model: Tuple[str, str]): dataset, label_columns = _get_dataset(model[1]) is_test_passed = True - project = mlrun.get_or_create_project("auto-trainer-test", context="./") + project = mlrun.new_project("auto-trainer-test", context="./") fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun") train_run = None @@ -119,7 +119,7 @@ def test_train_evaluate(model: Tuple[str, str]): dataset, label_columns = _get_dataset(model[1]) is_test_passed = True # Importing function: - project = mlrun.get_or_create_project("auto-trainer-test", context="./") + project = mlrun.new_project("auto-trainer-test", context="./") fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun") temp_dir = tempfile.mkdtemp() @@ -172,7 +172,7 @@ def test_train_predict(model: Tuple[str, str]): df = pd.read_csv(dataset) sample = df.head().drop("labels", axis=1).values.tolist() # Importing function: - project = mlrun.get_or_create_project("auto-trainer-test", context="./") + project = mlrun.new_project("auto-trainer-test", context="./") fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun") temp_dir = tempfile.mkdtemp() From da513958d3d7b6e1419eb71ff07d4687a7c5e993 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Tue, 20 Jan 2026 15:00:15 +0200 Subject: [PATCH 07/15] revert scikit-learn version --- functions/src/auto_trainer/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functions/src/auto_trainer/requirements.txt b/functions/src/auto_trainer/requirements.txt index 4854d84fd..b14a0293c 100644 --- a/functions/src/auto_trainer/requirements.txt +++ b/functions/src/auto_trainer/requirements.txt @@ -1,4 +1,4 @@ 
pandas -scikit-learn~=1.5 +scikit-learn<1.4.0 xgboost<2.0.0 plotly From c89f1c34a3eac2ad6780f74881e3cf83a70fff54 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Tue, 20 Jan 2026 15:20:46 +0200 Subject: [PATCH 08/15] scikit-learn==1.5.2 mlrun v 1.10 --- functions/src/auto_trainer/function.yaml | 42 +++++++++---------- functions/src/auto_trainer/item.yaml | 2 +- functions/src/auto_trainer/requirements.txt | 2 +- .../src/auto_trainer/test_auto_trainer.py | 6 +-- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/functions/src/auto_trainer/function.yaml b/functions/src/auto_trainer/function.yaml index 32e1f67dc..e4a36cd86 100644 --- a/functions/src/auto_trainer/function.yaml +++ b/functions/src/auto_trainer/function.yaml @@ -1,15 +1,15 @@ metadata: - name: auto-trainer categories: - machine-learning - model-training + name: auto-trainer tag: '' spec: - filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/auto_trainer/auto_trainer.py + command: '' + description: Automatic train, evaluate and predict functions for the ML frameworks + - Scikit-Learn, XGBoost and LightGBM. 
entry_points: train: - name: train - lineno: 126 parameters: - name: context type: MLClientCtx @@ -63,7 +63,8 @@ spec: type: dict doc: Labels to log with the model default: null - has_kwargs: true + has_varargs: false + lineno: 126 doc: "Training a model with the given dataset.\n\nexample::\n\n import mlrun\n\ \ project = mlrun.get_or_create_project(\"my-project\")\n project.set_function(\"\ hub://auto_trainer\", \"train\")\n trainer_run = project.run(\n \ @@ -74,10 +75,9 @@ spec: : \"my-model\",\n \"tag\": \"v1.0.0\",\n \"sample_set\"\ : \"./path/to/sample_set.csv\",\n \"test_set\": \"./path/to/test_set.csv\"\ ,\n \"CLASS_solver\": \"liblinear\",\n },\n )" - has_varargs: false + has_kwargs: true + name: train evaluate: - name: evaluate - lineno: 278 parameters: - name: context type: MLClientCtx @@ -97,12 +97,12 @@ spec: doc: The target label(s) of the column(s) in the dataset. for Regression or Classification tasks. Mandatory when dataset is not a FeatureVector. default: null - has_kwargs: true - doc: Evaluating a model. Artifacts generated by the MLHandler. has_varargs: false + lineno: 278 + doc: Evaluating a model. Artifacts generated by the MLHandler. + has_kwargs: true + name: evaluate predict: - name: predict - lineno: 332 parameters: - name: context type: MLClientCtx @@ -131,18 +131,18 @@ spec: doc: The db key to set name of the prediction result and the filename. Default to 'prediction'. default: null - has_kwargs: true - doc: Predicting dataset by a model. has_varargs: false - description: Automatic train, evaluate and predict functions for the ML frameworks - - Scikit-Learn, XGBoost and LightGBM. - command: '' - disable_auto_mount: false - image: mlrun/mlrun - default_handler: train + lineno: 332 + doc: Predicting dataset by a model. + has_kwargs: true + name: predict build: - code_origin: '' origin_filename: '' + code_origin: '' functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import pandas as pd
from mlrun import feature_store as fs
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.frameworks.auto_mlrun import AutoMLRun
from mlrun.utils.helpers import create_class, create_function
from sklearn.model_selection import train_test_split

# Type alias for a path value: either a plain string or a `pathlib.Path` object.
PathType = Union[str, Path]


class KWArgsPrefixes:
    """
    Prefixes used to route prefixed keyword arguments to their target callable.
    """

    # Keyword arguments reserved for a `train` function.
    TRAIN = "TRAIN_"
    # Keyword arguments for the model's `fit` call.
    FIT = "FIT_"
    # Keyword arguments for the model class constructor.
    MODEL_CLASS = "CLASS_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Getting the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
                            Classification tasks.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.

    :returns: A tuple of the dataset as a DataFrame and the label columns. For a FeatureVector the label columns
              default to the vector's label column.

    :raises ValueError: If `label_columns` is missing for a dataset that is not a FeatureVector, or if string
                        columns to drop are given for a list/dict dataset.
    """
    # Normalize `drop_columns` into a list. A scalar string must not be iterated character by character,
    # and the integer index 0 must not be treated as "nothing to drop":
    if drop_columns is None:
        columns_to_drop = []
    elif isinstance(drop_columns, (str, int)):
        columns_to_drop = [drop_columns]
    else:
        columns_to_drop = list(drop_columns)

    # Check if dataset is list/dict first (before trying to access artifact_url)
    if isinstance(dataset, (list, dict)):
        # list/dict case:
        if not label_columns:
            message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
            context.logger.info(message)
            raise ValueError(message)
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if columns_to_drop:
            if any(isinstance(col, str) for col in columns_to_drop):
                message = (
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
                )
                context.logger.error(message)
                raise ValueError(message)
            dataset.drop(columns_to_drop, axis=1, inplace=True)
        return dataset, label_columns

    # Dataset is a DataItem with artifact_url (URI or FeatureVector)
    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)

    # Getting the dataset:
    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
        label_columns = label_columns or dataset.meta.status.label_column
        context.logger.info(f"label columns: {label_columns}")
        feature_vector_drop_columns = columns_to_drop or None
        # FeatureVector case:
        try:
            fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
            dataset = fv.get_offline_features(
                drop_columns=feature_vector_drop_columns
            ).to_dataframe()
        except AttributeError:
            # Leave here for backwards compatibility
            dataset = fs.get_offline_features(
                dataset.meta.uri, drop_columns=feature_vector_drop_columns
            ).to_dataframe()
        return dataset, label_columns

    # simple URL case:
    if not label_columns:
        message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
        context.logger.info(message)
        raise ValueError(message)
    dataset = dataset.as_df()
    if columns_to_drop:
        # Drop is all-or-nothing: if any requested column is missing, nothing is dropped.
        if all(col in dataset for col in columns_to_drop):
            dataset = dataset.drop(columns_to_drop, axis=1)
        else:
            context.logger.info(
                "not all of the columns to drop in the dataset, drop columns process skipped"
            )

    return dataset, label_columns


def train(
    context: MLClientCtx,
    dataset: DataItem,
    model_class: str,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: List[str] = None,
    model_name: str = "model",
    tag: str = "",
    sample_set: DataItem = None,
    test_set: DataItem = None,
    train_test_split_size: float = None,
    random_state: int = None,
    labels: dict = None,
    **kwargs,
):
    """
    Training a model with the given dataset.

    example::

        import mlrun
        project = mlrun.get_or_create_project("my-project")
        project.set_function("hub://auto_trainer", "train")
        trainer_run = project.run(
            name="train",
            handler="train",
            inputs={"dataset": "./path/to/dataset.csv"},
            params={
                "model_class": "sklearn.linear_model.LogisticRegression",
                "label_columns": "label",
                "drop_columns": "id",
                "model_name": "my-model",
                "tag": "v1.0.0",
                "sample_set": "./path/to/sample_set.csv",
                "test_set": "./path/to/test_set.csv",
                "CLASS_solver": "liblinear",
            },
        )

    :param context:                 MLRun context
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param model_class:             The class of the model, e.g. `sklearn.linear_model.LogisticRegression`
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop
    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
    :param tag:                     The model's tag to log with
    :param sample_set:              A sample set of inputs for the model for logging its stats along the model in favour
                                    of model monitoring. Can be either a URI or a FeatureVector
    :param test_set:                The test set to evaluate the trained model on. When given, the whole dataset is
                                    used for training and `drop_columns` is dropped from the test set as well.
    :param train_test_split_size:   if test_set was provided then this argument is ignored.
                                    Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split. The size of the Training set is set to the complement of this
                                    value. Default = 0.2
    :param random_state:            Relevant only when using train_test_split_size.
                                    A random state seed to shuffle the data. For more information, see:
                                    https://scikit-learn.org/stable/glossary.html#term-random_state
                                    Notice that here we only pass integer values.
    :param labels:                  Labels to log with the model
    :param kwargs:                  Here you can pass keyword arguments with prefixes,
                                    that will be parsed and passed to the relevant function, by the following prefixes:
                                    - `CLASS_` - for the model class arguments
                                    - `FIT_` - for the `fit` function arguments
                                    - `TRAIN_` - for the `train` function (in xgb or lgbm train function - future)

    :raises NotImplementedError:    If `model_class` exposes a `train` attribute (function-style training is not
                                    supported yet).
    """
    # Validate inputs:
    # Exactly one of `test_set` / `train_test_split_size` ends up being used:
    if test_set is None:
        if train_test_split_size is None:
            context.logger.info(
                "test_set or train_test_split_size are not provided, setting train_test_split_size to 0.2"
            )
            train_test_split_size = 0.2

    elif train_test_split_size:
        context.logger.info(
            "test_set provided, ignoring given train_test_split_size value"
        )
        train_test_split_size = None

    # Get DataFrame by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Getting the sample set:
    if sample_set is None:
        context.logger.info(
            "Sample set not given, using the whole training set as the sample set"
        )
        sample_set = dataset
    else:
        sample_set, _ = _get_dataframe(
            context=context,
            dataset=sample_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

    # Parsing kwargs:
    # TODO: Use in xgb or lgbm train function.
    train_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.TRAIN)
    fit_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.FIT)
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=kwargs, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Check if model or function:
    if hasattr(model_class, "train"):
        # TODO: Need to call: model(), afterwards to start the train function.
        # model = create_function(f"{model_class}.train")
        raise NotImplementedError
    else:
        # Creating model instance:
        model = create_class(model_class)(**model_class_kwargs)

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]
    if train_test_split_size:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=train_test_split_size, random_state=random_state
        )
    else:
        # An explicit test set was given, so the whole dataset is used for training:
        x_train, y_train = x, y

        test_set = test_set.as_df()
        if drop_columns:
            # Drop the columns from the test set itself. Dropping from `dataset` here would replace the
            # test set with the training data and the model would be evaluated on what it was trained on.
            test_set = test_set.drop(drop_columns, axis=1)

        x_test, y_test = test_set.drop(label_columns, axis=1), test_set[label_columns]

    # Apply MLRun on the model before fitting, so the fit call below is tracked and logged:
    AutoMLRun.apply_mlrun(
        model=model,
        model_name=model_name,
        context=context,
        tag=tag,
        sample_set=sample_set,
        y_columns=label_columns,
        test_set=test_set,
        x_test=x_test,
        y_test=y_test,
        artifacts=context.artifacts,
        labels=labels,
    )
    context.logger.info(f"training '{model_name}'")
    model.fit(x_train, y_train, **fit_kwargs)


def evaluate(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: List[str] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    **kwargs,
):
    """
    Evaluating a model. Artifacts generated by the MLHandler.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to evaluate the model on. Can be either a URI or a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
                                    Labels given as feature strings are resolved to their alias when one is
                                    given, otherwise to the feature name. Plain column names are kept as is.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Parsing label_columns:
    if label_columns:
        if not isinstance(label_columns, list):
            label_columns = [label_columns]
        parsed_label_columns = []
        for lc in label_columns:
            if fs.common.feature_separator in lc:
                # Feature string - the column in the dataframe is named by the alias or the feature name:
                _, label_name, alias = fs.common.parse_feature_string(lc)
                parsed_label_columns.append(alias or label_name)
            else:
                # Plain column name - keep it, otherwise a mixed list would silently lose this label and
                # leave its column among the features:
                parsed_label_columns.append(lc)
        label_columns = parsed_label_columns

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]

    # Loading the model and predicting:
    # NOTE(review): the model name is hard-coded to "model_LinearRegression" - confirm this is intended for
    # every model type.
    model_handler = AutoMLRun.load_model(
        model_path=model, context=context, model_name="model_LinearRegression"
    )
    AutoMLRun.apply_mlrun(model_handler.model, y_test=y, model_path=model)

    context.logger.info(f"evaluating '{model_handler.model_name}'")
    model_handler.model.predict(x, **kwargs)


def predict(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: Union[str, List[str], int, List[int]] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    result_set: Optional[str] = None,
    **kwargs,
):
    """
    Predicting dataset by a model.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to predict the model on. Can be either a URI, a FeatureVector or a
                                    sample in a shape of a list/dict.
                                    When passing a sample, pass the dataset as a field in `params` instead of `inputs`.
    :param drop_columns:            str/int or a list of strings/ints that represent the column names/indices to drop.
                                    When the dataset is a list/dict this parameter should be represented by integers.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param result_set:              The db key to set name of the prediction result and the filename.
                                    Default to 'prediction'.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).

    :raises ValueError: If the model predicts fewer outputs than the number of given label columns, or if one of
                        the result label names already exists as a column in the dataset.
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # loading the model, and getting the model handler:
    model_handler = AutoMLRun.load_model(model_path=model, context=context)

    # Fix feature names for models that require them (e.g., XGBoost)
    # When dataset comes from a list, pandas assigns default integer column names
    # but some models expect specific feature names they were trained with
    if hasattr(model_handler.model, "feature_names_in_"):
        expected_features = list(model_handler.model.feature_names_in_)
        current_features = list(dataset.columns)
        # Only touch the columns if the number of columns matches:
        if (
            len(current_features) == len(expected_features)
            and current_features != expected_features
        ):
            if dataset.columns.is_unique and set(current_features) == set(
                expected_features
            ):
                # Same names in a different order - renaming here would attach the wrong name to each
                # column, so the columns are reordered instead:
                context.logger.info(
                    "Reordering dataset columns to match model's expected feature names"
                )
                dataset = dataset[expected_features]
            else:
                # This handles the case where a list was converted to DataFrame with default column names
                context.logger.info(
                    "Renaming dataset columns to match model's expected feature names"
                )
                dataset.columns = expected_features

    # Normalizing label columns into a new list (the list given by the caller is not mutated):
    if not label_columns:
        label_columns = []
    elif isinstance(label_columns, str):
        label_columns = [label_columns]
    else:
        label_columns = list(label_columns)

    # Predicting:
    context.logger.info(f"making prediction by '{model_handler.model_name}'")
    y_pred = model_handler.model.predict(dataset, **kwargs)

    # Preparing and validating label columns for the dataframe of the prediction result:
    num_predicted = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]

    if num_predicted > len(label_columns):
        if num_predicted == 1:
            label_columns = ["predicted labels"]
        else:
            # Generate names for the predicted outputs that have no label given:
            num_given = len(label_columns)
            label_columns.extend(
                f"predicted_label_{i + 1 + num_given}"
                for i in range(num_predicted - num_given)
            )
    elif num_predicted < len(label_columns):
        message = (
            f"number of predicted labels: {num_predicted} is smaller than number of label columns: "
            f"{len(label_columns)}"
        )
        context.logger.error(message)
        raise ValueError(message)

    artifact_name = result_set or "prediction"
    labels_inside_df = set(label_columns) & set(dataset.columns.tolist())
    if labels_inside_df:
        message = f"The labels: {labels_inside_df} are already existed in the dataframe"
        context.logger.error(message)
        raise ValueError(message)

    # `pd.concat` with axis=1 aligns rows by index, so the predictions must carry the dataset's index -
    # otherwise a dataset with a non-default index yields misaligned rows filled with NaN:
    predictions = pd.DataFrame(y_pred, columns=label_columns)
    predictions.index = dataset.index
    pred_df = pd.concat([dataset, predictions], axis=1)
    context.log_dataset(artifact_name, pred_df, db_key=result_set)
 + default_handler: train + disable_auto_mount: false + filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/auto_trainer/auto_trainer.py + image: mlrun/mlrun kind: job verbose: false diff --git a/functions/src/auto_trainer/item.yaml b/functions/src/auto_trainer/item.yaml index 78de92ca0..d397a79d6 100755 --- a/functions/src/auto_trainer/item.yaml +++ b/functions/src/auto_trainer/item.yaml @@ -13,7 +13,7 @@ labels: author: Iguazio maintainers: [] marketplaceType: '' -mlrunVersion: 1.7.0 +mlrunVersion: 1.10.0 name: auto_trainer platformVersion: 3.5.0 spec: diff --git a/functions/src/auto_trainer/requirements.txt b/functions/src/auto_trainer/requirements.txt index b14a0293c..274a97f82 100644 --- a/functions/src/auto_trainer/requirements.txt +++ b/functions/src/auto_trainer/requirements.txt @@ -1,4 +1,4 @@ pandas -scikit-learn<1.4.0 +scikit-learn==1.5.2 xgboost<2.0.0 plotly diff --git a/functions/src/auto_trainer/test_auto_trainer.py b/functions/src/auto_trainer/test_auto_trainer.py index 9a1ff554c..06b553a35 100644 --- a/functions/src/auto_trainer/test_auto_trainer.py +++ b/functions/src/auto_trainer/test_auto_trainer.py @@ -82,7 +82,7 @@ def test_train(model: Tuple[str, str]): dataset, label_columns = _get_dataset(model[1]) is_test_passed = True - project = mlrun.new_project("auto-trainer-test", context="./") + project = mlrun.get_or_create_project("auto-trainer-test", context="./") fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun") train_run = None @@ -119,7 +119,7 @@ def test_train_evaluate(model: Tuple[str, str]): dataset, label_columns = _get_dataset(model[1]) is_test_passed = True # Importing function: - project = mlrun.new_project("auto-trainer-test", context="./") + project = mlrun.get_or_create_project("auto-trainer-test", context="./") fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun") temp_dir = tempfile.mkdtemp() @@ -172,7 +172,7 @@ def test_train_predict(model: 
Tuple[str, str]): df = pd.read_csv(dataset) sample = df.head().drop("labels", axis=1).values.tolist() # Importing function: - project = mlrun.new_project("auto-trainer-test", context="./") + project = mlrun.get_or_create_project("auto-trainer-test", context="./") fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun") temp_dir = tempfile.mkdtemp() From 63b968c3409801c25964137995a0e00af23bdf40 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Tue, 20 Jan 2026 15:34:13 +0200 Subject: [PATCH 09/15] scikit-learn==1.4.2 --- functions/src/auto_trainer/function.yaml | 36 ++++++++++----------- functions/src/auto_trainer/requirements.txt | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/functions/src/auto_trainer/function.yaml b/functions/src/auto_trainer/function.yaml index e4a36cd86..04b9975c1 100644 --- a/functions/src/auto_trainer/function.yaml +++ b/functions/src/auto_trainer/function.yaml @@ -1,13 +1,11 @@ metadata: + tag: '' + name: auto-trainer categories: - machine-learning - model-training - name: auto-trainer - tag: '' spec: - command: '' - description: Automatic train, evaluate and predict functions for the ML frameworks - - Scikit-Learn, XGBoost and LightGBM. + default_handler: train entry_points: train: parameters: @@ -63,7 +61,8 @@ spec: type: dict doc: Labels to log with the model default: null - has_varargs: false + name: train + has_kwargs: true lineno: 126 doc: "Training a model with the given dataset.\n\nexample::\n\n import mlrun\n\ \ project = mlrun.get_or_create_project(\"my-project\")\n project.set_function(\"\ @@ -75,8 +74,7 @@ spec: : \"my-model\",\n \"tag\": \"v1.0.0\",\n \"sample_set\"\ : \"./path/to/sample_set.csv\",\n \"test_set\": \"./path/to/test_set.csv\"\ ,\n \"CLASS_solver\": \"liblinear\",\n },\n )" - has_kwargs: true - name: train + has_varargs: false evaluate: parameters: - name: context @@ -97,11 +95,11 @@ spec: doc: The target label(s) of the column(s) in the dataset. 
for Regression or Classification tasks. Mandatory when dataset is not a FeatureVector. default: null - has_varargs: false + name: evaluate + has_kwargs: true lineno: 278 doc: Evaluating a model. Artifacts generated by the MLHandler. - has_kwargs: true - name: evaluate + has_varargs: false predict: parameters: - name: context @@ -131,18 +129,20 @@ spec: doc: The db key to set name of the prediction result and the filename. Default to 'prediction'. default: null - has_varargs: false + name: predict + has_kwargs: true lineno: 332 doc: Predicting dataset by a model. - has_kwargs: true - name: predict + has_varargs: false + command: '' + image: mlrun/mlrun + description: Automatic train, evaluate and predict functions for the ML frameworks + - Scikit-Learn, XGBoost and LightGBM. + filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/auto_trainer/auto_trainer.py build: origin_filename: '' - code_origin: '' functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import pandas as pd
from mlrun import feature_store as fs
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.frameworks.auto_mlrun import AutoMLRun
from mlrun.utils.helpers import create_class, create_function
from sklearn.model_selection import train_test_split

# A path given either as a plain string or as a `pathlib.Path` object.
PathType = Union[str, Path]


class KWArgsPrefixes:
    """
    The prefixes that route a handler's keyword arguments to their destination.
    """

    # Keyword arguments for the `fit` function:
    FIT = "FIT_"
    # Keyword arguments for the model class:
    MODEL_CLASS = "CLASS_"
    # Keyword arguments for the `train` function:
    TRAIN = "TRAIN_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Getting the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
                            Classification tasks.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.

    :returns: A tuple of the dataset as a DataFrame and the label columns (taken from the FeatureVector when not
              given).

    :raises ValueError: If `label_columns` is missing for a dataset that is not a FeatureVector, or if
                        `drop_columns` holds strings while the dataset is a list/dict.
    """
    # `0` is a valid column index, so unlike None or an empty list it is a real request to drop a column:
    is_index_zero = (
        isinstance(drop_columns, int)
        and not isinstance(drop_columns, bool)
        and drop_columns == 0
    )
    has_drop_columns = bool(drop_columns) or is_index_zero

    # Check if dataset is list/dict first (before trying to access artifact_url)
    if isinstance(dataset, (list, dict)):
        # list/dict case:
        if not label_columns:
            message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
            context.logger.info(message)
            raise ValueError(message)
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if has_drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                message = (
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector "
                    "dataset"
                )
                context.logger.error(message)
                raise ValueError(message)
            dataset.drop(drop_columns, axis=1, inplace=True)
    else:
        # Dataset is a DataItem with artifact_url (URI or FeatureVector)
        store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)

        # Getting the dataset:
        if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
            label_columns = label_columns or dataset.meta.status.label_column
            context.logger.info(f"label columns: {label_columns}")
            # FeatureVector case:
            try:
                fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
                dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
            except AttributeError:
                # Leave here for backwards compatibility
                dataset = fs.get_offline_features(
                    dataset.meta.uri, drop_columns=drop_columns
                ).to_dataframe()
        else:
            # simple URL case:
            if not label_columns:
                message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
                context.logger.info(message)
                raise ValueError(message)
            dataset = dataset.as_df()
            if has_drop_columns:
                # A single column may be given as a scalar. Wrap it in a list, as iterating a string would
                # check each of its characters and iterating an int would raise a TypeError:
                columns_to_drop = (
                    [drop_columns]
                    if isinstance(drop_columns, (str, int))
                    else drop_columns
                )
                if all(col in dataset for col in columns_to_drop):
                    dataset = dataset.drop(columns_to_drop, axis=1)
                else:
                    context.logger.info(
                        "not all of the columns to drop in the dataset, drop columns process skipped"
                    )

    return dataset, label_columns


def train(
    context: MLClientCtx,
    dataset: DataItem,
    model_class: str,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: List[str] = None,
    model_name: str = "model",
    tag: str = "",
    sample_set: DataItem = None,
    test_set: DataItem = None,
    train_test_split_size: float = None,
    random_state: int = None,
    labels: dict = None,
    **kwargs,
):
    """
    Training a model with the given dataset.

    example::

        import mlrun
        project = mlrun.get_or_create_project("my-project")
        project.set_function("hub://auto_trainer", "train")
        trainer_run = project.run(
            name="train",
            handler="train",
            inputs={"dataset": "./path/to/dataset.csv"},
            params={
                "model_class": "sklearn.linear_model.LogisticRegression",
                "label_columns": "label",
                "drop_columns": "id",
                "model_name": "my-model",
                "tag": "v1.0.0",
                "sample_set": "./path/to/sample_set.csv",
                "test_set": "./path/to/test_set.csv",
                "CLASS_solver": "liblinear",
            },
        )

    :param context:                 MLRun context
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param model_class:             The class of the model, e.g. `sklearn.linear_model.LogisticRegression`
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop
    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
    :param tag:                     The model's tag to log with
    :param sample_set:              A sample set of inputs for the model for logging its stats along the model in favour
                                    of model monitoring. Can be either a URI or a FeatureVector
    :param test_set:                The test set to test the model with.
    :param train_test_split_size:   if test_set was provided then this argument is ignored.
                                    Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split. The size of the Training set is set to the complement of this
                                    value. Default = 0.2
    :param random_state:            Relevant only when using train_test_split_size.
                                    A random state seed to shuffle the data. For more information, see:
                                    https://scikit-learn.org/stable/glossary.html#term-random_state
                                    Notice that here we only pass integer values.
    :param labels:                  Labels to log with the model
    :param kwargs:                  Here you can pass keyword arguments with prefixes,
                                    that will be parsed and passed to the relevant function, by the following prefixes:
                                    - `CLASS_` - for the model class arguments
                                    - `FIT_` - for the `fit` function arguments
                                    - `TRAIN_` - for the `train` function (in xgb or lgbm train function - future)

    :raises NotImplementedError: If `model_class` has a `train` attribute (training by function is not supported
                                 yet).
    """
    # Validate inputs:
    # Check if exactly one of them is supplied:
    if test_set is None:
        if train_test_split_size is None:
            context.logger.info(
                "test_set or train_test_split_size are not provided, setting train_test_split_size to 0.2"
            )
            train_test_split_size = 0.2

    elif train_test_split_size:
        context.logger.info(
            "test_set provided, ignoring given train_test_split_size value"
        )
        train_test_split_size = None

    # Get DataFrame by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Getting the sample set:
    if sample_set is None:
        context.logger.info(
            "Sample set not given, using the whole training set as the sample set"
        )
        sample_set = dataset
    else:
        sample_set, _ = _get_dataframe(
            context=context,
            dataset=sample_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

    # Parsing kwargs:
    # TODO: Parse the `TRAIN_` prefixed kwargs and use them in xgb or lgbm train function.
    fit_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.FIT)
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=kwargs, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Check if model or function:
    # NOTE(review): `model_class` is annotated as a str, which has no `train` attribute, so this branch
    # presumably never triggers - confirm.
    if hasattr(model_class, "train"):
        # TODO: Need to call: model(), afterwards to start the train function.
        # model = create_function(f"{model_class}.train")
        raise NotImplementedError
    else:
        # Creating model instance:
        model = create_class(model_class)(**model_class_kwargs)

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]
    if train_test_split_size:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=train_test_split_size, random_state=random_state
        )
    else:
        x_train, y_train = x, y

        test_set = test_set.as_df()
        if drop_columns:
            # Drop from the test set itself - dropping from `dataset` here would replace the test set
            # with the training data:
            test_set = test_set.drop(drop_columns, axis=1)

        x_test, y_test = test_set.drop(label_columns, axis=1), test_set[label_columns]

    AutoMLRun.apply_mlrun(
        model=model,
        model_name=model_name,
        context=context,
        tag=tag,
        sample_set=sample_set,
        y_columns=label_columns,
        test_set=test_set,
        x_test=x_test,
        y_test=y_test,
        artifacts=context.artifacts,
        labels=labels,
    )
    context.logger.info(f"training '{model_name}'")
    model.fit(x_train, y_train, **fit_kwargs)


def evaluate(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: List[str] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    **kwargs,
):
    """
    Evaluating a model. Artifacts generated by the MLHandler.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to evaluate the model on. Can be either a URI or a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
                                    Labels given as feature strings are resolved to their alias when one is
                                    given, otherwise to the feature name. Plain column names are kept as is.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Parsing label_columns:
    if label_columns:
        if not isinstance(label_columns, list):
            label_columns = [label_columns]
        parsed_label_columns = []
        for lc in label_columns:
            if fs.common.feature_separator in lc:
                # Feature string - the column in the dataframe is named by the alias or the feature name:
                _, label_name, alias = fs.common.parse_feature_string(lc)
                parsed_label_columns.append(alias or label_name)
            else:
                # Plain column name - keep it, otherwise a mixed list would silently lose this label and
                # leave its column among the features:
                parsed_label_columns.append(lc)
        label_columns = parsed_label_columns

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]

    # Loading the model and predicting:
    # NOTE(review): the model name is hard-coded to "model_LinearRegression" - confirm this is intended for
    # every model type.
    model_handler = AutoMLRun.load_model(
        model_path=model, context=context, model_name="model_LinearRegression"
    )
    AutoMLRun.apply_mlrun(model_handler.model, y_test=y, model_path=model)

    context.logger.info(f"evaluating '{model_handler.model_name}'")
    model_handler.model.predict(x, **kwargs)


def predict(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: Union[str, List[str], int, List[int]] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    result_set: Optional[str] = None,
    **kwargs,
):
    """
    Predicting dataset by a model.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to predict the model on. Can be either a URI, a FeatureVector or a
                                    sample in a shape of a list/dict.
                                    When passing a sample, pass the dataset as a field in `params` instead of `inputs`.
    :param drop_columns:            str/int or a list of strings/ints that represent the column names/indices to drop.
                                    When the dataset is a list/dict this parameter should be represented by integers.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param result_set:              The db key to set name of the prediction result and the filename.
                                    Default to 'prediction'.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).

    :raises ValueError: If the model predicts fewer outputs than the number of given label columns, or if one of
                        the result label names already exists as a column in the dataset.
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # loading the model, and getting the model handler:
    model_handler = AutoMLRun.load_model(model_path=model, context=context)

    # Fix feature names for models that require them (e.g., XGBoost)
    # When dataset comes from a list, pandas assigns default integer column names
    # but some models expect specific feature names they were trained with
    if hasattr(model_handler.model, "feature_names_in_"):
        expected_features = list(model_handler.model.feature_names_in_)
        current_features = list(dataset.columns)
        # Only touch the columns if the number of columns matches:
        if (
            len(current_features) == len(expected_features)
            and current_features != expected_features
        ):
            if dataset.columns.is_unique and set(current_features) == set(
                expected_features
            ):
                # Same names in a different order - renaming here would attach the wrong name to each
                # column, so the columns are reordered instead:
                context.logger.info(
                    "Reordering dataset columns to match model's expected feature names"
                )
                dataset = dataset[expected_features]
            else:
                # This handles the case where a list was converted to DataFrame with default column names
                context.logger.info(
                    "Renaming dataset columns to match model's expected feature names"
                )
                dataset.columns = expected_features

    # Normalizing label columns into a new list (the list given by the caller is not mutated):
    if not label_columns:
        label_columns = []
    elif isinstance(label_columns, str):
        label_columns = [label_columns]
    else:
        label_columns = list(label_columns)

    # Predicting:
    context.logger.info(f"making prediction by '{model_handler.model_name}'")
    y_pred = model_handler.model.predict(dataset, **kwargs)

    # Preparing and validating label columns for the dataframe of the prediction result:
    num_predicted = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]

    if num_predicted > len(label_columns):
        if num_predicted == 1:
            label_columns = ["predicted labels"]
        else:
            # Generate names for the predicted outputs that have no label given:
            num_given = len(label_columns)
            label_columns.extend(
                f"predicted_label_{i + 1 + num_given}"
                for i in range(num_predicted - num_given)
            )
    elif num_predicted < len(label_columns):
        message = (
            f"number of predicted labels: {num_predicted} is smaller than number of label columns: "
            f"{len(label_columns)}"
        )
        context.logger.error(message)
        raise ValueError(message)

    artifact_name = result_set or "prediction"
    labels_inside_df = set(label_columns) & set(dataset.columns.tolist())
    if labels_inside_df:
        message = f"The labels: {labels_inside_df} are already existed in the dataframe"
        context.logger.error(message)
        raise ValueError(message)

    # `pd.concat` with axis=1 aligns rows by index, so the predictions must carry the dataset's index -
    # otherwise a dataset with a non-default index yields misaligned rows filled with NaN:
    predictions = pd.DataFrame(y_pred, columns=label_columns)
    predictions.index = dataset.index
    pred_df = pd.concat([dataset, predictions], axis=1)
    context.log_dataset(artifact_name, pred_df, db_key=result_set)
 - default_handler: train + code_origin: '' disable_auto_mount: false - filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/auto_trainer/auto_trainer.py - image: mlrun/mlrun kind: job verbose: false diff --git a/functions/src/auto_trainer/requirements.txt b/functions/src/auto_trainer/requirements.txt index 274a97f82..80346561b 100644 --- a/functions/src/auto_trainer/requirements.txt +++ b/functions/src/auto_trainer/requirements.txt @@ -1,4 +1,4 @@ pandas -scikit-learn==1.5.2 +scikit-learn==1.4.2 xgboost<2.0.0 plotly From dc2a7663682984095f8023b8f5ed52c1cb767545 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Wed, 21 Jan 2026 11:44:50 +0200 Subject: [PATCH 10/15] revert scikit-learn<1.4.0 --- functions/src/auto_trainer/function.yaml | 60 ++++++++++----------- functions/src/auto_trainer/item.yaml | 2 +- functions/src/auto_trainer/requirements.txt | 2 +- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/functions/src/auto_trainer/function.yaml b/functions/src/auto_trainer/function.yaml index 04b9975c1..c879673f6 100644 --- a/functions/src/auto_trainer/function.yaml +++ b/functions/src/auto_trainer/function.yaml @@ -1,13 +1,32 @@ +kind: job metadata: - tag: '' name: auto-trainer categories: - machine-learning - model-training + tag: '' +verbose: false spec: - default_handler: train + image: mlrun/mlrun + description: Automatic train, evaluate and predict functions for the ML frameworks + - Scikit-Learn, XGBoost and LightGBM. 
+ disable_auto_mount: false + filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/auto_trainer/auto_trainer.py entry_points: train: + name: train + lineno: 126 + doc: "Training a model with the given dataset.\n\nexample::\n\n import mlrun\n\ + \ project = mlrun.get_or_create_project(\"my-project\")\n project.set_function(\"\ + hub://auto_trainer\", \"train\")\n trainer_run = project.run(\n \ + \ name=\"train\",\n handler=\"train\",\n inputs={\"dataset\"\ + : \"./path/to/dataset.csv\"},\n params={\n \"model_class\"\ + : \"sklearn.linear_model.LogisticRegression\",\n \"label_columns\"\ + : \"label\",\n \"drop_columns\": \"id\",\n \"model_name\"\ + : \"my-model\",\n \"tag\": \"v1.0.0\",\n \"sample_set\"\ + : \"./path/to/sample_set.csv\",\n \"test_set\": \"./path/to/test_set.csv\"\ + ,\n \"CLASS_solver\": \"liblinear\",\n },\n )" + has_varargs: false parameters: - name: context type: MLClientCtx @@ -61,21 +80,12 @@ spec: type: dict doc: Labels to log with the model default: null - name: train has_kwargs: true - lineno: 126 - doc: "Training a model with the given dataset.\n\nexample::\n\n import mlrun\n\ - \ project = mlrun.get_or_create_project(\"my-project\")\n project.set_function(\"\ - hub://auto_trainer\", \"train\")\n trainer_run = project.run(\n \ - \ name=\"train\",\n handler=\"train\",\n inputs={\"dataset\"\ - : \"./path/to/dataset.csv\"},\n params={\n \"model_class\"\ - : \"sklearn.linear_model.LogisticRegression\",\n \"label_columns\"\ - : \"label\",\n \"drop_columns\": \"id\",\n \"model_name\"\ - : \"my-model\",\n \"tag\": \"v1.0.0\",\n \"sample_set\"\ - : \"./path/to/sample_set.csv\",\n \"test_set\": \"./path/to/test_set.csv\"\ - ,\n \"CLASS_solver\": \"liblinear\",\n },\n )" - has_varargs: false evaluate: + name: evaluate + lineno: 278 + doc: Evaluating a model. Artifacts generated by the MLHandler. 
+ has_varargs: false parameters: - name: context type: MLClientCtx @@ -95,12 +105,12 @@ spec: doc: The target label(s) of the column(s) in the dataset. for Regression or Classification tasks. Mandatory when dataset is not a FeatureVector. default: null - name: evaluate has_kwargs: true - lineno: 278 - doc: Evaluating a model. Artifacts generated by the MLHandler. - has_varargs: false predict: + name: predict + lineno: 332 + doc: Predicting dataset by a model. + has_varargs: false parameters: - name: context type: MLClientCtx @@ -129,20 +139,10 @@ spec: doc: The db key to set name of the prediction result and the filename. Default to 'prediction'. default: null - name: predict has_kwargs: true - lineno: 332 - doc: Predicting dataset by a model. - has_varargs: false command: '' - image: mlrun/mlrun - description: Automatic train, evaluate and predict functions for the ML frameworks - - Scikit-Learn, XGBoost and LightGBM. - filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/auto_trainer/auto_trainer.py + default_handler: train build: origin_filename: '' functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import pandas as pd
from mlrun import feature_store as fs
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.frameworks.auto_mlrun import AutoMLRun
from mlrun.utils.helpers import create_class, create_function
from sklearn.model_selection import train_test_split

PathType = Union[str, Path]


class KWArgsPrefixes:
    """
    Prefixes that mark which destination a keyword argument of `train` is meant for. A keyword argument is routed by
    the prefix its name starts with, and the prefix is removed before the argument is forwarded.
    """

    # Arguments forwarded to the model class constructor:
    MODEL_CLASS = "CLASS_"
    # Arguments forwarded to the model's `fit` call:
    FIT = "FIT_"
    # Arguments reserved for a framework-level `train` function (collected by `train` but not used yet):
    TRAIN = "TRAIN_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Getting the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
                            Classification tasks.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.

    :returns: A tuple of the dataset as a DataFrame (after dropping the requested columns) and the label columns. For
              a FeatureVector dataset with no `label_columns` given, the label column is read from the vector.

    :raises ValueError: If `label_columns` is missing for a dataset that is not a FeatureVector, or if `drop_columns`
                        holds strings while the dataset is a list/dict.
    """
    # Check if dataset is list/dict first (before trying to access artifact_url)
    if isinstance(dataset, (list, dict)):
        # list/dict case:
        if not label_columns:
            message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
            context.logger.error(message)
            raise ValueError(message)
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                message = (
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
                )
                context.logger.error(message)
                raise ValueError(message)
            dataset.drop(drop_columns, axis=1, inplace=True)
    else:
        # Dataset is a DataItem with artifact_url (URI or FeatureVector)
        store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)

        # Getting the dataset:
        if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
            label_columns = label_columns or dataset.meta.status.label_column
            context.logger.info(f"label columns: {label_columns}")
            # FeatureVector case:
            try:
                fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
                dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
            except AttributeError:
                # Leave here for backwards compatibility
                dataset = fs.get_offline_features(
                    dataset.meta.uri, drop_columns=drop_columns
                ).to_dataframe()
        else:
            # simple URL case:
            if not label_columns:
                message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
                context.logger.error(message)
                raise ValueError(message)
            dataset = dataset.as_df()
            if drop_columns:
                # A single column name must be wrapped in a list before the membership check. Iterating a bare
                # string would test each of its characters against the columns, so a valid name such as "id" would
                # be reported as missing and never dropped:
                columns_to_drop = (
                    [drop_columns]
                    if isinstance(drop_columns, (str, int))
                    else drop_columns
                )
                if all(col in dataset for col in columns_to_drop):
                    dataset = dataset.drop(columns_to_drop, axis=1)
                else:
                    context.logger.info(
                        "not all of the columns to drop in the dataset, drop columns process skipped"
                    )

    return dataset, label_columns


def train(
    context: MLClientCtx,
    dataset: DataItem,
    model_class: str,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: List[str] = None,
    model_name: str = "model",
    tag: str = "",
    sample_set: DataItem = None,
    test_set: DataItem = None,
    train_test_split_size: float = None,
    random_state: int = None,
    labels: dict = None,
    **kwargs,
):
    """
    Training a model with the given dataset.

    example::

        import mlrun
        project = mlrun.get_or_create_project("my-project")
        project.set_function("hub://auto_trainer", "train")
        trainer_run = project.run(
            name="train",
            handler="train",
            inputs={"dataset": "./path/to/dataset.csv"},
            params={
                "model_class": "sklearn.linear_model.LogisticRegression",
                "label_columns": "label",
                "drop_columns": "id",
                "model_name": "my-model",
                "tag": "v1.0.0",
                "sample_set": "./path/to/sample_set.csv",
                "test_set": "./path/to/test_set.csv",
                "CLASS_solver": "liblinear",
            },
        )

    :param context:                 MLRun context
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param model_class:             The class of the model, e.g. `sklearn.linear_model.LogisticRegression`
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop
    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
    :param tag:                     The model's tag to log with
    :param sample_set:              A sample set of inputs for the model for logging its stats along the model in favour
                                    of model monitoring. Can be either a URI or a FeatureVector
    :param test_set:                The test set to hand to MLRun along the training. When given, the whole dataset is
                                    used for training and `drop_columns` are dropped from the test set as well.
    :param train_test_split_size:   if test_set was provided then this argument is ignored.
                                    Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split. The size of the Training set is set to the complement of this
                                    value. Default = 0.2
    :param random_state:            Relevant only when using train_test_split_size.
                                    A random state seed to shuffle the data. For more information, see:
                                    https://scikit-learn.org/stable/glossary.html#term-random_state
                                    Notice that here we only pass integer values.
    :param labels:                  Labels to log with the model
    :param kwargs:                  Here you can pass keyword arguments with prefixes,
                                    that will be parsed and passed to the relevant function, by the following prefixes:
                                    - `CLASS_` - for the model class arguments
                                    - `FIT_` - for the `fit` function arguments
                                    - `TRAIN_` - for the `train` function (in xgb or lgbm train function - future)

    :raises NotImplementedError: If `model_class` exposes a `train` attribute (function based training is not
                                 supported yet).
    """
    # Validate inputs:
    # Check if exactly one of them is supplied:
    if test_set is None:
        if train_test_split_size is None:
            context.logger.info(
                "test_set or train_test_split_size are not provided, setting train_test_split_size to 0.2"
            )
            train_test_split_size = 0.2

    elif train_test_split_size:
        context.logger.info(
            "test_set provided, ignoring given train_test_split_size value"
        )
        train_test_split_size = None

    # Get DataFrame by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Getting the sample set:
    if sample_set is None:
        context.logger.info(
            "Sample set not given, using the whole training set as the sample set"
        )
        sample_set = dataset
    else:
        sample_set, _ = _get_dataframe(
            context=context,
            dataset=sample_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

    # Parsing kwargs:
    # TODO: Use in xgb or lgbm train function.
    train_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.TRAIN)
    fit_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.FIT)
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=kwargs, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Check if model or function (placeholder for a future function based training flow):
    if hasattr(model_class, "train"):
        # TODO: Need to call: model(), afterwards to start the train function.
        # model = create_function(f"{model_class}.train")
        raise NotImplementedError
    else:
        # Creating model instance:
        model = create_class(model_class)(**model_class_kwargs)

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]
    if train_test_split_size:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=train_test_split_size, random_state=random_state
        )
    else:
        # A test set was given, so the whole dataset is used for training:
        x_train, y_train = x, y

        test_set = test_set.as_df()
        if drop_columns:
            # Drop from the test set itself. Dropping from `dataset` here would replace the given test set with the
            # training data:
            test_set = test_set.drop(drop_columns, axis=1)

        x_test, y_test = test_set.drop(label_columns, axis=1), test_set[label_columns]

    # NOTE(review): `apply_mlrun` is called before `fit`, presumably so that the fit call is tracked by MLRun -
    # confirm against the AutoMLRun documentation before reordering.
    AutoMLRun.apply_mlrun(
        model=model,
        model_name=model_name,
        context=context,
        tag=tag,
        sample_set=sample_set,
        y_columns=label_columns,
        test_set=test_set,
        x_test=x_test,
        y_test=y_test,
        artifacts=context.artifacts,
        labels=labels,
    )
    context.logger.info(f"training '{model_name}'")
    model.fit(x_train, y_train, **fit_kwargs)


def evaluate(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: List[str] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    **kwargs,
):
    """
    Evaluating a model. Artifacts generated by the MLHandler.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to evaluate the model on. Can be either a URI or a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
                                    Labels given as feature strings are resolved to their alias, or to the feature
                                    name when no alias is set. Plain column names are kept as they are.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Parsing label_columns:
    if label_columns:
        if not isinstance(label_columns, list):
            label_columns = [label_columns]
        parsed_label_columns = []
        for lc in label_columns:
            if fs.common.feature_separator in lc:
                # A feature string - the column in the dataframe is named by the alias or the feature name:
                _, label_name, alias = fs.common.parse_feature_string(lc)
                parsed_label_columns.append(alias or label_name)
            else:
                # A plain column name - keep it, so a mix of feature strings and plain names does not lose labels:
                parsed_label_columns.append(lc)
        label_columns = parsed_label_columns

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]

    # Loading the model and predicting:
    # NOTE(review): the model name is hard-coded here while `predict` lets `load_model` use its default - confirm
    # whether this is intentional.
    model_handler = AutoMLRun.load_model(
        model_path=model, context=context, model_name="model_LinearRegression"
    )
    AutoMLRun.apply_mlrun(model_handler.model, y_test=y, model_path=model)

    context.logger.info(f"evaluating '{model_handler.model_name}'")
    model_handler.model.predict(x, **kwargs)


def predict(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: Union[str, List[str], int, List[int]] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    result_set: Optional[str] = None,
    **kwargs,
):
    """
    Predicting dataset by a model.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to predict the model on. Can be either a URI, a FeatureVector or a
                                    sample in a shape of a list/dict.
                                    When passing a sample, pass the dataset as a field in `params` instead of `inputs`.
    :param drop_columns:            str/int or a list of strings/ints that represent the column names/indices to drop.
                                    When the dataset is a list/dict this parameter should be represented by integers.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param result_set:              The db key to set name of the prediction result and the filename.
                                    Default to 'prediction'.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).

    :raises ValueError: If the model predicts fewer outputs than the number of given label columns, or if a label
                        column name already exists in the dataset.
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # loading the model, and getting the model handler:
    model_handler = AutoMLRun.load_model(model_path=model, context=context)

    # Fix feature names for models that require them (e.g., XGBoost)
    # When dataset comes from a list, pandas assigns default integer column names
    # but some models expect specific feature names they were trained with
    if hasattr(model_handler.model, 'feature_names_in_'):
        expected_features = model_handler.model.feature_names_in_
        # Only rename if the number of columns matches
        # This handles the case where a list was converted to DataFrame with default column names
        if len(dataset.columns) == len(expected_features) and any(
            col != feat for col, feat in zip(dataset.columns, expected_features)
        ):
            context.logger.info(
                "Renaming dataset columns to match model's expected feature names"
            )
            dataset.columns = expected_features

    # Normalizing the label columns into a list:
    if not label_columns:
        label_columns = []
    elif isinstance(label_columns, str):
        label_columns = [label_columns]

    # Predicting:
    context.logger.info(f"making prediction by '{model_handler.model_name}'")
    y_pred = model_handler.model.predict(dataset, **kwargs)

    # Preparing and validating label columns for the dataframe of the prediction result:
    num_predicted = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]

    if num_predicted > len(label_columns):
        if num_predicted == 1:
            label_columns = ["predicted labels"]
        else:
            # Generate names for the predicted outputs that have no label column given:
            label_columns.extend(
                [
                    f"predicted_label_{i + 1 + len(label_columns)}"
                    for i in range(num_predicted - len(label_columns))
                ]
            )
    elif num_predicted < len(label_columns):
        message = (
            f"number of predicted labels: {num_predicted} is smaller than number of label columns: {len(label_columns)}"
        )
        context.logger.error(message)
        raise ValueError(message)

    artifact_name = result_set or "prediction"
    labels_inside_df = set(label_columns) & set(dataset.columns.tolist())
    if labels_inside_df:
        message = f"The labels: {labels_inside_df} are already existed in the dataframe"
        context.logger.error(message)
        raise ValueError(message)

    # Predictions are positional (one per dataset row), so they must carry the dataset's index. Otherwise
    # `pd.concat` would align on index labels and a dataset with a non-default index would get NaN-padded rows:
    predictions = pd.DataFrame(y_pred, columns=label_columns)
    predictions.index = dataset.index
    pred_df = pd.concat([dataset, predictions], axis=1)
    context.log_dataset(artifact_name, pred_df, db_key=result_set)
 code_origin: '' - disable_auto_mount: false -kind: job -verbose: false diff --git a/functions/src/auto_trainer/item.yaml b/functions/src/auto_trainer/item.yaml index d397a79d6..78de92ca0 100755 --- a/functions/src/auto_trainer/item.yaml +++ b/functions/src/auto_trainer/item.yaml @@ -13,7 +13,7 @@ labels: author: Iguazio maintainers: [] marketplaceType: '' -mlrunVersion: 1.10.0 +mlrunVersion: 1.7.0 name: auto_trainer platformVersion: 3.5.0 spec: diff --git a/functions/src/auto_trainer/requirements.txt b/functions/src/auto_trainer/requirements.txt index 80346561b..b14a0293c 100644 --- a/functions/src/auto_trainer/requirements.txt +++ b/functions/src/auto_trainer/requirements.txt @@ -1,4 +1,4 @@ pandas -scikit-learn==1.4.2 +scikit-learn<1.4.0 xgboost<2.0.0 plotly From 514f4a480098d24702785165baaad4676a187f1c Mon Sep 17 00:00:00 2001 From: tomerbv Date: Wed, 21 Jan 2026 11:48:57 +0200 Subject: [PATCH 11/15] scikit-learn~=1.5 --- functions/src/auto_trainer/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functions/src/auto_trainer/requirements.txt b/functions/src/auto_trainer/requirements.txt index b14a0293c..4854d84fd 100644 --- a/functions/src/auto_trainer/requirements.txt +++ b/functions/src/auto_trainer/requirements.txt @@ -1,4 +1,4 @@ pandas -scikit-learn<1.4.0 +scikit-learn~=1.5 xgboost<2.0.0 plotly From 1f76feb1fbce477f0419b1a2f59eb4f0d59f52cc Mon Sep 17 00:00:00 2001 From: tomerbv Date: Wed, 21 Jan 2026 11:58:13 +0200 Subject: [PATCH 12/15] mlrun 1.10 with scikit-learn<1.4.0 --- functions/src/auto_trainer/function.yaml | 47 ++++++++++----------- functions/src/auto_trainer/item.yaml | 2 +- functions/src/auto_trainer/requirements.txt | 2 +- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/functions/src/auto_trainer/function.yaml b/functions/src/auto_trainer/function.yaml index c879673f6..155e5c58e 100644 --- a/functions/src/auto_trainer/function.yaml +++ b/functions/src/auto_trainer/function.yaml @@ -1,21 +1,13 @@ 
kind: job -metadata: - name: auto-trainer - categories: - - machine-learning - - model-training - tag: '' verbose: false spec: - image: mlrun/mlrun - description: Automatic train, evaluate and predict functions for the ML frameworks - - Scikit-Learn, XGBoost and LightGBM. disable_auto_mount: false - filename: /Users/Tomer_Weitzman/PycharmProjects/functions/functions/src/auto_trainer/auto_trainer.py + build: + functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import pandas as pd
from mlrun import feature_store as fs
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.frameworks.auto_mlrun import AutoMLRun
from mlrun.utils.helpers import create_class, create_function
from sklearn.model_selection import train_test_split

PathType = Union[str, Path]


class KWArgsPrefixes:
    """
    Prefixes that mark which destination a keyword argument of `train` is meant for. A keyword argument is routed by
    the prefix its name starts with, and the prefix is removed before the argument is forwarded.
    """

    # Arguments forwarded to the model class constructor:
    MODEL_CLASS = "CLASS_"
    # Arguments forwarded to the model's `fit` call:
    FIT = "FIT_"
    # Arguments reserved for a framework-level `train` function (collected by `train` but not used yet):
    TRAIN = "TRAIN_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Getting the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
                            Classification tasks.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.

    :returns: A tuple of the dataset as a DataFrame (after dropping the requested columns) and the label columns. For
              a FeatureVector dataset with no `label_columns` given, the label column is read from the vector.

    :raises ValueError: If `label_columns` is missing for a dataset that is not a FeatureVector, or if `drop_columns`
                        holds strings while the dataset is a list/dict.
    """
    # Check if dataset is list/dict first (before trying to access artifact_url)
    if isinstance(dataset, (list, dict)):
        # list/dict case:
        if not label_columns:
            message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
            context.logger.error(message)
            raise ValueError(message)
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                message = (
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
                )
                context.logger.error(message)
                raise ValueError(message)
            dataset.drop(drop_columns, axis=1, inplace=True)
    else:
        # Dataset is a DataItem with artifact_url (URI or FeatureVector)
        store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)

        # Getting the dataset:
        if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
            label_columns = label_columns or dataset.meta.status.label_column
            context.logger.info(f"label columns: {label_columns}")
            # FeatureVector case:
            try:
                fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
                dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
            except AttributeError:
                # Leave here for backwards compatibility
                dataset = fs.get_offline_features(
                    dataset.meta.uri, drop_columns=drop_columns
                ).to_dataframe()
        else:
            # simple URL case:
            if not label_columns:
                message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
                context.logger.error(message)
                raise ValueError(message)
            dataset = dataset.as_df()
            if drop_columns:
                # A single column name must be wrapped in a list before the membership check. Iterating a bare
                # string would test each of its characters against the columns, so a valid name such as "id" would
                # be reported as missing and never dropped:
                columns_to_drop = (
                    [drop_columns]
                    if isinstance(drop_columns, (str, int))
                    else drop_columns
                )
                if all(col in dataset for col in columns_to_drop):
                    dataset = dataset.drop(columns_to_drop, axis=1)
                else:
                    context.logger.info(
                        "not all of the columns to drop in the dataset, drop columns process skipped"
                    )

    return dataset, label_columns


def train(
    context: MLClientCtx,
    dataset: DataItem,
    model_class: str,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: List[str] = None,
    model_name: str = "model",
    tag: str = "",
    sample_set: DataItem = None,
    test_set: DataItem = None,
    train_test_split_size: float = None,
    random_state: int = None,
    labels: dict = None,
    **kwargs,
):
    """
    Training a model with the given dataset.

    example::

        import mlrun
        project = mlrun.get_or_create_project("my-project")
        project.set_function("hub://auto_trainer", "train")
        trainer_run = project.run(
            name="train",
            handler="train",
            inputs={"dataset": "./path/to/dataset.csv"},
            params={
                "model_class": "sklearn.linear_model.LogisticRegression",
                "label_columns": "label",
                "drop_columns": "id",
                "model_name": "my-model",
                "tag": "v1.0.0",
                "sample_set": "./path/to/sample_set.csv",
                "test_set": "./path/to/test_set.csv",
                "CLASS_solver": "liblinear",
            },
        )

    :param context:                 MLRun context
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param model_class:             The class of the model, e.g. `sklearn.linear_model.LogisticRegression`
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop
    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
    :param tag:                     The model's tag to log with
    :param sample_set:              A sample set of inputs for the model for logging its stats along the model in favour
                                    of model monitoring. Can be either a URI or a FeatureVector
    :param test_set:                The test set to hand to MLRun along the training. When given, the whole dataset is
                                    used for training and `drop_columns` are dropped from the test set as well.
    :param train_test_split_size:   if test_set was provided then this argument is ignored.
                                    Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split. The size of the Training set is set to the complement of this
                                    value. Default = 0.2
    :param random_state:            Relevant only when using train_test_split_size.
                                    A random state seed to shuffle the data. For more information, see:
                                    https://scikit-learn.org/stable/glossary.html#term-random_state
                                    Notice that here we only pass integer values.
    :param labels:                  Labels to log with the model
    :param kwargs:                  Here you can pass keyword arguments with prefixes,
                                    that will be parsed and passed to the relevant function, by the following prefixes:
                                    - `CLASS_` - for the model class arguments
                                    - `FIT_` - for the `fit` function arguments
                                    - `TRAIN_` - for the `train` function (in xgb or lgbm train function - future)

    :raises NotImplementedError: If `model_class` exposes a `train` attribute (function based training is not
                                 supported yet).
    """
    # Validate inputs:
    # Check if exactly one of them is supplied:
    if test_set is None:
        if train_test_split_size is None:
            context.logger.info(
                "test_set or train_test_split_size are not provided, setting train_test_split_size to 0.2"
            )
            train_test_split_size = 0.2

    elif train_test_split_size:
        context.logger.info(
            "test_set provided, ignoring given train_test_split_size value"
        )
        train_test_split_size = None

    # Get DataFrame by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Getting the sample set:
    if sample_set is None:
        context.logger.info(
            "Sample set not given, using the whole training set as the sample set"
        )
        sample_set = dataset
    else:
        sample_set, _ = _get_dataframe(
            context=context,
            dataset=sample_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

    # Parsing kwargs:
    # TODO: Use in xgb or lgbm train function.
    train_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.TRAIN)
    fit_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.FIT)
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=kwargs, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Check if model or function (placeholder for a future function based training flow):
    if hasattr(model_class, "train"):
        # TODO: Need to call: model(), afterwards to start the train function.
        # model = create_function(f"{model_class}.train")
        raise NotImplementedError
    else:
        # Creating model instance:
        model = create_class(model_class)(**model_class_kwargs)

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]
    if train_test_split_size:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=train_test_split_size, random_state=random_state
        )
    else:
        # A test set was given, so the whole dataset is used for training:
        x_train, y_train = x, y

        test_set = test_set.as_df()
        if drop_columns:
            # Drop from the test set itself. Dropping from `dataset` here would replace the given test set with the
            # training data:
            test_set = test_set.drop(drop_columns, axis=1)

        x_test, y_test = test_set.drop(label_columns, axis=1), test_set[label_columns]

    # NOTE(review): `apply_mlrun` is called before `fit`, presumably so that the fit call is tracked by MLRun -
    # confirm against the AutoMLRun documentation before reordering.
    AutoMLRun.apply_mlrun(
        model=model,
        model_name=model_name,
        context=context,
        tag=tag,
        sample_set=sample_set,
        y_columns=label_columns,
        test_set=test_set,
        x_test=x_test,
        y_test=y_test,
        artifacts=context.artifacts,
        labels=labels,
    )
    context.logger.info(f"training '{model_name}'")
    model.fit(x_train, y_train, **fit_kwargs)


def evaluate(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: List[str] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    **kwargs,
):
    """
    Evaluating a model. Artifacts generated by the MLHandler.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to evaluate the model on. Can be either a URI or a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Parsing label_columns:
    if label_columns:
        if not isinstance(label_columns, list):
            label_columns = [label_columns]
        parsed_label_columns = []
        for lc in label_columns:
            if fs.common.feature_separator in lc:
                # Feature-store style reference: keep only the alias (or the plain feature name).
                _, label_name, alias = fs.common.parse_feature_string(lc)
                parsed_label_columns.append(alias or label_name)
            else:
                # Plain column names must be kept as well, otherwise a mixed list would silently
                # lose them (they would stay in `x` and be missing from `y`).
                parsed_label_columns.append(lc)
        label_columns = parsed_label_columns

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]

    # Loading the model and predicting:
    # NOTE(review): the model name is hard-coded here while `predict` does not pass one - confirm
    # whether this is intentional.
    model_handler = AutoMLRun.load_model(
        model_path=model, context=context, model_name="model_LinearRegression"
    )
    AutoMLRun.apply_mlrun(model_handler.model, y_test=y, model_path=model)

    context.logger.info(f"evaluating '{model_handler.model_name}'")
    model_handler.model.predict(x, **kwargs)


def predict(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: Union[str, List[str], int, List[int]] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    result_set: Optional[str] = None,
    **kwargs,
):
    """
    Predicting dataset by a model.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to predict the model on. Can be either a URI, a FeatureVector or a
                                    sample in a shape of a list/dict.
                                    When passing a sample, pass the dataset as a field in `params` instead of `inputs`.
    :param drop_columns:            str/int or a list of strings/ints that represent the column names/indices to drop.
                                    When the dataset is a list/dict this parameter should be represented by integers.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param result_set:              The db key to set name of the prediction result and the filename.
                                    Default to 'prediction'.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).

    :raise ValueError: If the model predicts fewer outputs than the number of given label columns, or if a label
                       column name already exists in the dataset.
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # loading the model, and getting the model handler:
    model_handler = AutoMLRun.load_model(model_path=model, context=context)

    # Fix feature names for models that require them (e.g., XGBoost)
    # When dataset comes from a list, pandas assigns default integer column names
    # but some models expect specific feature names they were trained with
    if hasattr(model_handler.model, "feature_names_in_"):
        expected_features = list(model_handler.model.feature_names_in_)
        actual_features = dataset.columns.tolist()
        # Only touch the columns if the number of columns matches but the names/order do not:
        if (
            len(actual_features) == len(expected_features)
            and actual_features != expected_features
        ):
            if dataset.columns.is_unique and set(actual_features) == set(
                expected_features
            ):
                # Same names in a different order: reorder instead of renaming, a positional rename
                # would attach the wrong feature name to each column's data.
                context.logger.info(
                    "Reordering dataset columns to match model's expected feature order"
                )
                dataset = dataset[expected_features]
            else:
                # This handles the case where a list was converted to DataFrame with default column names
                context.logger.info(
                    "Renaming dataset columns to match model's expected feature names"
                )
                dataset.columns = expected_features

    # Normalizing label columns into a list:
    if not label_columns:
        label_columns = []
    elif isinstance(label_columns, str):
        label_columns = [label_columns]
    else:
        # Copy, so extending the list below never mutates the caller's object.
        label_columns = list(label_columns)

    # Predicting:
    context.logger.info(f"making prediction by '{model_handler.model_name}'")
    y_pred = model_handler.model.predict(dataset, **kwargs)

    # Preparing and validating label columns for the dataframe of the prediction result:
    num_predicted = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]

    if num_predicted > len(label_columns):
        if num_predicted == 1:
            label_columns = ["predicted labels"]
        else:
            # Generate names for the outputs that were not named by the user:
            label_columns.extend(
                [
                    f"predicted_label_{i + 1 + len(label_columns)}"
                    for i in range(num_predicted - len(label_columns))
                ]
            )
    elif num_predicted < len(label_columns):
        message = (
            f"number of predicted labels: {num_predicted} is smaller than number of label columns: "
            f"{len(label_columns)}"
        )
        context.logger.error(message)
        raise ValueError(message)

    artifact_name = result_set or "prediction"
    labels_inside_df = set(label_columns) & set(dataset.columns.tolist())
    if labels_inside_df:
        message = f"The labels: {labels_inside_df} are already existed in the dataframe"
        context.logger.error(message)
        raise ValueError(message)

    # `pd.concat(axis=1)` aligns on the index, so the predictions must carry the dataset's index;
    # otherwise a non-default dataset index would yield NaN-padded, misaligned rows.
    predictions = pd.DataFrame(y_pred, columns=label_columns)
    predictions.index = dataset.index
    pred_df = pd.concat([dataset, predictions], axis=1)
    context.log_dataset(artifact_name, pred_df, db_key=result_set)
 + origin_filename: '' + code_origin: '' entry_points: train: - name: train - lineno: 126 doc: "Training a model with the given dataset.\n\nexample::\n\n import mlrun\n\ \ project = mlrun.get_or_create_project(\"my-project\")\n project.set_function(\"\ hub://auto_trainer\", \"train\")\n trainer_run = project.run(\n \ @@ -26,7 +18,6 @@ spec: : \"my-model\",\n \"tag\": \"v1.0.0\",\n \"sample_set\"\ : \"./path/to/sample_set.csv\",\n \"test_set\": \"./path/to/test_set.csv\"\ ,\n \"CLASS_solver\": \"liblinear\",\n },\n )" - has_varargs: false parameters: - name: context type: MLClientCtx @@ -80,12 +71,12 @@ spec: type: dict doc: Labels to log with the model default: null + name: train + lineno: 126 + has_varargs: false has_kwargs: true evaluate: - name: evaluate - lineno: 278 doc: Evaluating a model. Artifacts generated by the MLHandler. - has_varargs: false parameters: - name: context type: MLClientCtx @@ -105,12 +96,12 @@ spec: doc: The target label(s) of the column(s) in the dataset. for Regression or Classification tasks. Mandatory when dataset is not a FeatureVector. default: null + name: evaluate + lineno: 278 + has_varargs: false has_kwargs: true predict: - name: predict - lineno: 332 doc: Predicting dataset by a model. - has_varargs: false parameters: - name: context type: MLClientCtx @@ -139,10 +130,18 @@ spec: doc: The db key to set name of the prediction result and the filename. Default to 'prediction'. default: null + name: predict + lineno: 332 + has_varargs: false has_kwargs: true - command: '' + description: Automatic train, evaluate and predict functions for the ML frameworks + - Scikit-Learn, XGBoost and LightGBM. default_handler: train - build: - origin_filename: '' - functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import pandas as pd
from mlrun import feature_store as fs
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.frameworks.auto_mlrun import AutoMLRun
from mlrun.utils.helpers import create_class, create_function
from sklearn.model_selection import train_test_split

# Type alias for a path given either as a string or as a `pathlib.Path` object.
PathType = Union[str, Path]


class KWArgsPrefixes:
    """
    Key prefixes used for routing the extra keyword arguments of `train` to their destination.
    """

    # Arguments for a `train` function (collected by `train`, not used yet).
    TRAIN = "TRAIN_"
    # Arguments forwarded to the model's `fit` call.
    FIT = "FIT_"
    # Arguments forwarded to the model class constructor.
    MODEL_CLASS = "CLASS_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Getting the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
                            Classification tasks.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.

    :returns: A tuple of the dataset as a DataFrame and the label columns (taken from the FeatureVector's status
              when not given explicitly).

    :raise ValueError: If `label_columns` is missing for a dataset that is not a FeatureVector, or if string
                       `drop_columns` are given along with a list/dict dataset.
    """
    # Check if dataset is list/dict first (before trying to access artifact_url)
    if isinstance(dataset, (list, dict)):
        # list/dict case:
        if not label_columns:
            message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
            context.logger.error(message)
            raise ValueError(message)
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                message = (
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector "
                    "dataset"
                )
                context.logger.error(message)
                raise ValueError(message)
            dataset.drop(drop_columns, axis=1, inplace=True)
    else:
        # Dataset is a DataItem with artifact_url (URI or FeatureVector)
        store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)

        # Getting the dataset:
        if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
            label_columns = label_columns or dataset.meta.status.label_column
            context.logger.info(f"label columns: {label_columns}")
            # FeatureVector case:
            try:
                fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
                dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
            except AttributeError:
                # Leave here for backwards compatibility
                dataset = fs.get_offline_features(
                    dataset.meta.uri, drop_columns=drop_columns
                ).to_dataframe()
        else:
            # simple URL case:
            if not label_columns:
                message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
                context.logger.error(message)
                raise ValueError(message)
            dataset = dataset.as_df()
            if drop_columns:
                # A single column may be given as a plain str/int. Wrap it in a list, otherwise the
                # membership check below would iterate over the characters of the string and the
                # drop would be silently skipped.
                columns_to_drop = (
                    [drop_columns]
                    if isinstance(drop_columns, (str, int))
                    else drop_columns
                )
                if all(col in dataset for col in columns_to_drop):
                    dataset = dataset.drop(columns_to_drop, axis=1)
                else:
                    context.logger.info(
                        "not all of the columns to drop in the dataset, drop columns process skipped"
                    )

    return dataset, label_columns


def train(
    context: MLClientCtx,
    dataset: DataItem,
    model_class: str,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: List[str] = None,
    model_name: str = "model",
    tag: str = "",
    sample_set: DataItem = None,
    test_set: DataItem = None,
    train_test_split_size: float = None,
    random_state: int = None,
    labels: dict = None,
    **kwargs,
):
    """
    Training a model with the given dataset.

    example::

        import mlrun
        project = mlrun.get_or_create_project("my-project")
        project.set_function("hub://auto_trainer", "train")
        trainer_run = project.run(
            name="train",
            handler="train",
            inputs={"dataset": "./path/to/dataset.csv"},
            params={
                "model_class": "sklearn.linear_model.LogisticRegression",
                "label_columns": "label",
                "drop_columns": "id",
                "model_name": "my-model",
                "tag": "v1.0.0",
                "sample_set": "./path/to/sample_set.csv",
                "test_set": "./path/to/test_set.csv",
                "CLASS_solver": "liblinear",
            },
        )

    :param context:                 MLRun context
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param model_class:             The class of the model, e.g. `sklearn.linear_model.LogisticRegression`
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop
    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
    :param tag:                     The model's tag to log with
    :param sample_set:              A sample set of inputs for the model for logging its stats along the model in favour
                                    of model monitoring. Can be either a URI or a FeatureVector
    :param test_set:                The test set to test the model with. When given, the whole dataset is used for
                                    training.
    :param train_test_split_size:   if test_set was provided then this argument is ignored.
                                    Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split. The size of the Training set is set to the complement of this
                                    value. Default = 0.2
    :param random_state:            Relevant only when using train_test_split_size.
                                    A random state seed to shuffle the data. For more information, see:
                                    https://scikit-learn.org/stable/glossary.html#term-random_state
                                    Notice that here we only pass integer values.
    :param labels:                  Labels to log with the model
    :param kwargs:                  Here you can pass keyword arguments with prefixes,
                                    that will be parsed and passed to the relevant function, by the following prefixes:
                                    - `CLASS_` - for the model class arguments
                                    - `FIT_` - for the `fit` function arguments
                                    - `TRAIN_` - for the `train` function (in xgb or lgbm train function - future)

    """
    # Validate inputs:
    # Check if exactly one of them is supplied:
    if test_set is None:
        if train_test_split_size is None:
            context.logger.info(
                "test_set or train_test_split_size are not provided, setting train_test_split_size to 0.2"
            )
            train_test_split_size = 0.2

    elif train_test_split_size:
        context.logger.info(
            "test_set provided, ignoring given train_test_split_size value"
        )
        train_test_split_size = None

    # Get DataFrame by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Getting the sample set:
    if sample_set is None:
        context.logger.info(
            "Sample set not given, using the whole training set as the sample set"
        )
        sample_set = dataset
    else:
        sample_set, _ = _get_dataframe(
            context=context,
            dataset=sample_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

    # Parsing kwargs:
    # TODO: Use in xgb or lgbm train function.
    train_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.TRAIN)
    fit_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.FIT)
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=kwargs, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Check if model or function:
    if hasattr(model_class, "train"):
        # TODO: Need to call: model(), afterwards to start the train function.
        # model = create_function(f"{model_class}.train")
        raise NotImplementedError
    else:
        # Creating model instance:
        model = create_class(model_class)(**model_class_kwargs)

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]
    if train_test_split_size:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=train_test_split_size, random_state=random_state
        )
    else:
        # A test set was given, so the whole dataset is used for training:
        x_train, y_train = x, y

        test_set = test_set.as_df()
        if drop_columns:
            # Drop the columns from the test set itself. Dropping them from `dataset` would replace
            # the test set with the training data (whose columns `_get_dataframe` already handled).
            test_set = test_set.drop(drop_columns, axis=1)

        x_test, y_test = test_set.drop(label_columns, axis=1), test_set[label_columns]

    # Applying MLRun on the model before fitting it:
    AutoMLRun.apply_mlrun(
        model=model,
        model_name=model_name,
        context=context,
        tag=tag,
        sample_set=sample_set,
        y_columns=label_columns,
        test_set=test_set,
        x_test=x_test,
        y_test=y_test,
        artifacts=context.artifacts,
        labels=labels,
    )
    context.logger.info(f"training '{model_name}'")
    model.fit(x_train, y_train, **fit_kwargs)


def evaluate(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: List[str] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    **kwargs,
):
    """
    Evaluating a model. Artifacts generated by the MLHandler.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to evaluate the model on. Can be either a URI or a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Parsing label_columns:
    if label_columns:
        if not isinstance(label_columns, list):
            label_columns = [label_columns]
        parsed_label_columns = []
        for lc in label_columns:
            if fs.common.feature_separator in lc:
                # Feature-store style reference: keep only the alias (or the plain feature name).
                _, label_name, alias = fs.common.parse_feature_string(lc)
                parsed_label_columns.append(alias or label_name)
            else:
                # Plain column names must be kept as well, otherwise a mixed list would silently
                # lose them (they would stay in `x` and be missing from `y`).
                parsed_label_columns.append(lc)
        label_columns = parsed_label_columns

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]

    # Loading the model and predicting:
    # NOTE(review): the model name is hard-coded here while `predict` does not pass one - confirm
    # whether this is intentional.
    model_handler = AutoMLRun.load_model(
        model_path=model, context=context, model_name="model_LinearRegression"
    )
    AutoMLRun.apply_mlrun(model_handler.model, y_test=y, model_path=model)

    context.logger.info(f"evaluating '{model_handler.model_name}'")
    model_handler.model.predict(x, **kwargs)


def predict(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: Union[str, List[str], int, List[int]] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    result_set: Optional[str] = None,
    **kwargs,
):
    """
    Predicting dataset by a model.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to predict the model on. Can be either a URI, a FeatureVector or a
                                    sample in a shape of a list/dict.
                                    When passing a sample, pass the dataset as a field in `params` instead of `inputs`.
    :param drop_columns:            str/int or a list of strings/ints that represent the column names/indices to drop.
                                    When the dataset is a list/dict this parameter should be represented by integers.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param result_set:              The db key to set name of the prediction result and the filename.
                                    Default to 'prediction'.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).

    :raise ValueError: If the model predicts fewer outputs than the number of given label columns, or if a label
                       column name already exists in the dataset.
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # loading the model, and getting the model handler:
    model_handler = AutoMLRun.load_model(model_path=model, context=context)

    # Fix feature names for models that require them (e.g., XGBoost)
    # When dataset comes from a list, pandas assigns default integer column names
    # but some models expect specific feature names they were trained with
    if hasattr(model_handler.model, "feature_names_in_"):
        expected_features = list(model_handler.model.feature_names_in_)
        actual_features = dataset.columns.tolist()
        # Only touch the columns if the number of columns matches but the names/order do not:
        if (
            len(actual_features) == len(expected_features)
            and actual_features != expected_features
        ):
            if dataset.columns.is_unique and set(actual_features) == set(
                expected_features
            ):
                # Same names in a different order: reorder instead of renaming, a positional rename
                # would attach the wrong feature name to each column's data.
                context.logger.info(
                    "Reordering dataset columns to match model's expected feature order"
                )
                dataset = dataset[expected_features]
            else:
                # This handles the case where a list was converted to DataFrame with default column names
                context.logger.info(
                    "Renaming dataset columns to match model's expected feature names"
                )
                dataset.columns = expected_features

    # Normalizing label columns into a list:
    if not label_columns:
        label_columns = []
    elif isinstance(label_columns, str):
        label_columns = [label_columns]
    else:
        # Copy, so extending the list below never mutates the caller's object.
        label_columns = list(label_columns)

    # Predicting:
    context.logger.info(f"making prediction by '{model_handler.model_name}'")
    y_pred = model_handler.model.predict(dataset, **kwargs)

    # Preparing and validating label columns for the dataframe of the prediction result:
    num_predicted = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]

    if num_predicted > len(label_columns):
        if num_predicted == 1:
            label_columns = ["predicted labels"]
        else:
            # Generate names for the outputs that were not named by the user:
            label_columns.extend(
                [
                    f"predicted_label_{i + 1 + len(label_columns)}"
                    for i in range(num_predicted - len(label_columns))
                ]
            )
    elif num_predicted < len(label_columns):
        message = (
            f"number of predicted labels: {num_predicted} is smaller than number of label columns: "
            f"{len(label_columns)}"
        )
        context.logger.error(message)
        raise ValueError(message)

    artifact_name = result_set or "prediction"
    labels_inside_df = set(label_columns) & set(dataset.columns.tolist())
    if labels_inside_df:
        message = f"The labels: {labels_inside_df} are already existed in the dataframe"
        context.logger.error(message)
        raise ValueError(message)

    # `pd.concat(axis=1)` aligns on the index, so the predictions must carry the dataset's index;
    # otherwise a non-default dataset index would yield NaN-padded, misaligned rows.
    predictions = pd.DataFrame(y_pred, columns=label_columns)
    predictions.index = dataset.index
    pred_df = pd.concat([dataset, predictions], axis=1)
    context.log_dataset(artifact_name, pred_df, db_key=result_set)
 - code_origin: '' + image: mlrun/mlrun + command: '' +metadata: + tag: '' + name: auto-trainer + categories: + - machine-learning + - model-training diff --git a/functions/src/auto_trainer/item.yaml b/functions/src/auto_trainer/item.yaml index 78de92ca0..d397a79d6 100755 --- a/functions/src/auto_trainer/item.yaml +++ b/functions/src/auto_trainer/item.yaml @@ -13,7 +13,7 @@ labels: author: Iguazio maintainers: [] marketplaceType: '' -mlrunVersion: 1.7.0 +mlrunVersion: 1.10.0 name: auto_trainer platformVersion: 3.5.0 spec: diff --git a/functions/src/auto_trainer/requirements.txt b/functions/src/auto_trainer/requirements.txt index 4854d84fd..b14a0293c 100644 --- a/functions/src/auto_trainer/requirements.txt +++ b/functions/src/auto_trainer/requirements.txt @@ -1,4 +1,4 @@ pandas -scikit-learn~=1.5 +scikit-learn<1.4.0 xgboost<2.0.0 plotly From ace141e84f2df82048a22d63ab993bdbc381ad11 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Wed, 21 Jan 2026 15:50:10 +0200 Subject: [PATCH 13/15] scikit-learn strict v~=1.5.2 added skip for test_train in test_auto_trainer.py --- functions/src/auto_trainer/function.yaml | 34 +++++++++---------- functions/src/auto_trainer/requirements.txt | 2 +- .../src/auto_trainer/test_auto_trainer.py | 4 +++ functions/src/describe/requirements.txt | 2 +- functions/src/gen_class_data/requirements.txt | 2 +- .../src/sklearn_classifier/requirements.txt | 2 +- 6 files changed, 25 insertions(+), 21 deletions(-) diff --git a/functions/src/auto_trainer/function.yaml b/functions/src/auto_trainer/function.yaml index 155e5c58e..50a36e750 100644 --- a/functions/src/auto_trainer/function.yaml +++ b/functions/src/auto_trainer/function.yaml @@ -1,11 +1,6 @@ -kind: job verbose: false +kind: job spec: - disable_auto_mount: false - build: - functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import pandas as pd
from mlrun import feature_store as fs
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.frameworks.auto_mlrun import AutoMLRun
from mlrun.utils.helpers import create_class, create_function
from sklearn.model_selection import train_test_split

# Type alias for a path given either as a string or as a `pathlib.Path` object.
PathType = Union[str, Path]


class KWArgsPrefixes:
    """
    Key prefixes used for routing the extra keyword arguments of `train` to their destination.
    """

    # Arguments for a `train` function (collected by `train`, not used yet).
    TRAIN = "TRAIN_"
    # Arguments forwarded to the model's `fit` call.
    FIT = "FIT_"
    # Arguments forwarded to the model class constructor.
    MODEL_CLASS = "CLASS_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Getting the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
                            Classification tasks.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.

    :returns: A tuple of the dataset as a DataFrame and the label columns (taken from the FeatureVector's status
              when not given explicitly).

    :raise ValueError: If `label_columns` is missing for a dataset that is not a FeatureVector, or if string
                       `drop_columns` are given along with a list/dict dataset.
    """
    # Check if dataset is list/dict first (before trying to access artifact_url)
    if isinstance(dataset, (list, dict)):
        # list/dict case:
        if not label_columns:
            message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
            context.logger.error(message)
            raise ValueError(message)
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                message = (
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector "
                    "dataset"
                )
                context.logger.error(message)
                raise ValueError(message)
            dataset.drop(drop_columns, axis=1, inplace=True)
    else:
        # Dataset is a DataItem with artifact_url (URI or FeatureVector)
        store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)

        # Getting the dataset:
        if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
            label_columns = label_columns or dataset.meta.status.label_column
            context.logger.info(f"label columns: {label_columns}")
            # FeatureVector case:
            try:
                fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
                dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
            except AttributeError:
                # Leave here for backwards compatibility
                dataset = fs.get_offline_features(
                    dataset.meta.uri, drop_columns=drop_columns
                ).to_dataframe()
        else:
            # simple URL case:
            if not label_columns:
                message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
                context.logger.error(message)
                raise ValueError(message)
            dataset = dataset.as_df()
            if drop_columns:
                # A single column may be given as a plain str/int. Wrap it in a list, otherwise the
                # membership check below would iterate over the characters of the string and the
                # drop would be silently skipped.
                columns_to_drop = (
                    [drop_columns]
                    if isinstance(drop_columns, (str, int))
                    else drop_columns
                )
                if all(col in dataset for col in columns_to_drop):
                    dataset = dataset.drop(columns_to_drop, axis=1)
                else:
                    context.logger.info(
                        "not all of the columns to drop in the dataset, drop columns process skipped"
                    )

    return dataset, label_columns


def train(
    context: MLClientCtx,
    dataset: DataItem,
    model_class: str,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: List[str] = None,
    model_name: str = "model",
    tag: str = "",
    sample_set: DataItem = None,
    test_set: DataItem = None,
    train_test_split_size: float = None,
    random_state: int = None,
    labels: dict = None,
    **kwargs,
):
    """
    Training a model with the given dataset.

    example::

        import mlrun
        project = mlrun.get_or_create_project("my-project")
        project.set_function("hub://auto_trainer", "train")
        trainer_run = project.run(
            name="train",
            handler="train",
            inputs={"dataset": "./path/to/dataset.csv"},
            params={
                "model_class": "sklearn.linear_model.LogisticRegression",
                "label_columns": "label",
                "drop_columns": "id",
                "model_name": "my-model",
                "tag": "v1.0.0",
                "sample_set": "./path/to/sample_set.csv",
                "test_set": "./path/to/test_set.csv",
                "CLASS_solver": "liblinear",
            },
        )

    :param context:                 MLRun context
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param model_class:             The class of the model, e.g. `sklearn.linear_model.LogisticRegression`
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop
    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
    :param tag:                     The model's tag to log with
    :param sample_set:              A sample set of inputs for the model for logging its stats along the model in favour
                                    of model monitoring. Can be either a URI or a FeatureVector
    :param test_set:                The test set to test the model with. When provided, `drop_columns` are dropped
                                    from it as well.
    :param train_test_split_size:   if test_set was provided then this argument is ignored.
                                    Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split. The size of the Training set is set to the complement of this
                                    value. Default = 0.2
    :param random_state:            Relevant only when using train_test_split_size.
                                    A random state seed to shuffle the data. For more information, see:
                                    https://scikit-learn.org/stable/glossary.html#term-random_state
                                    Notice that here we only pass integer values.
    :param labels:                  Labels to log with the model
    :param kwargs:                  Here you can pass keyword arguments with prefixes,
                                    that will be parsed and passed to the relevant function, by the following prefixes:
                                    - `CLASS_` - for the model class arguments
                                    - `FIT_` - for the `fit` function arguments
                                    - `TRAIN_` - for the `train` function (in xgb or lgbm train function - future)

    :raises ValueError: If neither a test set nor a positive `train_test_split_size` is available.
    """
    # Validate inputs:
    # Only one of `test_set` / `train_test_split_size` is used - an explicit test set takes precedence:
    if test_set is None:
        if train_test_split_size is None:
            context.logger.info(
                "test_set or train_test_split_size are not provided, setting train_test_split_size to 0.2"
            )
            train_test_split_size = 0.2

    elif train_test_split_size:
        context.logger.info(
            "test_set provided, ignoring given train_test_split_size value"
        )
        train_test_split_size = None

    # Get DataFrame by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Getting the sample set:
    if sample_set is None:
        context.logger.info(
            "Sample set not given, using the whole training set as the sample set"
        )
        sample_set = dataset
    else:
        sample_set, _ = _get_dataframe(
            context=context,
            dataset=sample_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

    # Parsing kwargs:
    # TODO: Use in xgb or lgbm train function.
    train_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.TRAIN)
    fit_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.FIT)
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=kwargs, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Check if model or function:
    if hasattr(model_class, "train"):
        # TODO: Need to call: model(), afterwards to start the train function.
        # model = create_function(f"{model_class}.train")
        raise NotImplementedError
    else:
        # Creating model instance:
        model = create_class(model_class)(**model_class_kwargs)

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]
    if train_test_split_size:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=train_test_split_size, random_state=random_state
        )
    else:
        if test_set is None:
            # A falsy split size (e.g. 0) without a test set leaves nothing to test the model on.
            message = (
                "train_test_split_size must be greater than 0 when test_set is not provided"
            )
            context.logger.error(message)
            raise ValueError(message)

        # The whole dataset is used for training, the given test set is used for testing:
        x_train, y_train = x, y

        test_set = test_set.as_df()
        if drop_columns:
            # Drop the columns from the test set itself (and not from the training dataset), so the test set keeps
            # its own rows and has the same features as the training data:
            test_set = test_set.drop(drop_columns, axis=1)

        x_test, y_test = test_set.drop(label_columns, axis=1), test_set[label_columns]

    AutoMLRun.apply_mlrun(
        model=model,
        model_name=model_name,
        context=context,
        tag=tag,
        sample_set=sample_set,
        y_columns=label_columns,
        test_set=test_set,
        x_test=x_test,
        y_test=y_test,
        artifacts=context.artifacts,
        labels=labels,
    )
    context.logger.info(f"training '{model_name}'")
    model.fit(x_train, y_train, **fit_kwargs)


def evaluate(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: List[str] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    **kwargs,
):
    """
    Evaluating a model. Artifacts generated by the MLHandler.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to evaluate the model on. Can be either a URI or a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
                                    Labels given as a feature string (containing the feature separator) are resolved
                                    to their alias or label name, other labels are used as given.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Parsing label_columns:
    if label_columns:
        if not isinstance(label_columns, list):
            label_columns = [label_columns]
        parsed_label_columns = []
        for lc in label_columns:
            if fs.common.feature_separator in lc:
                # A feature string - the dataframe column is named after the alias (if given) or the label name:
                _, label_name, alias = fs.common.parse_feature_string(lc)
                parsed_label_columns.append(alias or label_name)
            else:
                # A plain column name - keep it, so a mix of plain and feature-string labels is not
                # silently reduced to the feature-string ones only:
                parsed_label_columns.append(lc)
        label_columns = parsed_label_columns

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]

    # Loading the model and predicting:
    # NOTE(review): the model name passed to `load_model` is hard-coded - confirm this is intended for models that
    # were logged under a different name.
    model_handler = AutoMLRun.load_model(
        model_path=model, context=context, model_name="model_LinearRegression"
    )
    AutoMLRun.apply_mlrun(model_handler.model, y_test=y, model_path=model)

    context.logger.info(f"evaluating '{model_handler.model_name}'")
    model_handler.model.predict(x, **kwargs)


def predict(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: Union[str, List[str], int, List[int]] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    result_set: Optional[str] = None,
    **kwargs,
):
    """
    Predicting dataset by a model.

    The prediction is logged as a dataset artifact holding the input dataset columns followed by the predicted
    label column(s).

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to predict the model on. Can be either a URI, a FeatureVector or a
                                    sample in a shape of a list/dict.
                                    When passing a sample, pass the dataset as a field in `params` instead of `inputs`.
    :param drop_columns:            str/int or a list of strings/ints that represent the column names/indices to drop.
                                    When the dataset is a list/dict this parameter should be represented by integers.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param result_set:              The db key to set name of the prediction result and the filename.
                                    Default to 'prediction'.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).

    :raises ValueError: If more label columns were given than the number of predicted outputs, or if one of the
                        label columns already exists in the dataset.
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # loading the model, and getting the model handler:
    model_handler = AutoMLRun.load_model(model_path=model, context=context)

    # Fix feature names for models that require them (e.g., XGBoost)
    # When dataset comes from a list, pandas assigns default integer column names
    # but some models expect specific feature names they were trained with
    if hasattr(model_handler.model, 'feature_names_in_'):
        expected_features = model_handler.model.feature_names_in_
        if len(dataset.columns) == len(expected_features):
            # Only rename if the number of columns matches
            # This handles the case where a list was converted to DataFrame with default column names
            if not all(col == feat for col, feat in zip(dataset.columns, expected_features)):
                context.logger.info(
                    "Renaming dataset columns to match model's expected feature names"
                )
                dataset.columns = expected_features

    # Normalizing the label columns into a list owned by this function (a copy is taken so extending it below does
    # not mutate a list object that belongs to the caller):
    if not label_columns:
        label_columns = []
    elif isinstance(label_columns, str):
        label_columns = [label_columns]
    else:
        label_columns = list(label_columns)

    # Predicting:
    context.logger.info(f"making prediction by '{model_handler.model_name}'")
    y_pred = model_handler.model.predict(dataset, **kwargs)

    # Preparing and validating label columns for the dataframe of the prediction result:
    num_predicted = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]
    num_given = len(label_columns)

    if num_predicted > num_given:
        if num_predicted == 1:
            label_columns = ["predicted labels"]
        else:
            # Generate names for the predicted outputs that were not named by the user:
            label_columns.extend(
                f"predicted_label_{i + 1 + num_given}"
                for i in range(num_predicted - num_given)
            )
    elif num_predicted < num_given:
        message = (
            f"number of predicted labels: {num_predicted} is smaller than number of label columns: {num_given}"
        )
        context.logger.error(message)
        raise ValueError(message)

    artifact_name = result_set or "prediction"
    labels_inside_df = set(label_columns) & set(dataset.columns.tolist())
    if labels_inside_df:
        message = f"The labels: {labels_inside_df} are already existed in the dataframe"
        context.logger.error(message)
        raise ValueError(message)

    # The predictions are given the dataset's own index: `pd.concat` on axis=1 aligns rows by index, so a fresh
    # RangeIndex would misalign predictions and rows whenever the dataset index is not the default 0..n-1 one.
    predictions = pd.DataFrame(y_pred, columns=label_columns, index=dataset.index)
    pred_df = pd.concat([dataset, predictions], axis=1)
    context.log_dataset(artifact_name, pred_df, db_key=result_set)
 - origin_filename: '' - code_origin: '' entry_points: train: doc: "Training a model with the given dataset.\n\nexample::\n\n import mlrun\n\ @@ -18,6 +13,7 @@ spec: : \"my-model\",\n \"tag\": \"v1.0.0\",\n \"sample_set\"\ : \"./path/to/sample_set.csv\",\n \"test_set\": \"./path/to/test_set.csv\"\ ,\n \"CLASS_solver\": \"liblinear\",\n },\n )" + has_kwargs: true parameters: - name: context type: MLClientCtx @@ -71,12 +67,12 @@ spec: type: dict doc: Labels to log with the model default: null - name: train lineno: 126 + name: train has_varargs: false - has_kwargs: true evaluate: doc: Evaluating a model. Artifacts generated by the MLHandler. + has_kwargs: true parameters: - name: context type: MLClientCtx @@ -96,12 +92,12 @@ spec: doc: The target label(s) of the column(s) in the dataset. for Regression or Classification tasks. Mandatory when dataset is not a FeatureVector. default: null - name: evaluate lineno: 278 + name: evaluate has_varargs: false - has_kwargs: true predict: doc: Predicting dataset by a model. + has_kwargs: true parameters: - name: context type: MLClientCtx @@ -130,18 +126,22 @@ spec: doc: The db key to set name of the prediction result and the filename. Default to 'prediction'. default: null - name: predict lineno: 332 + name: predict has_varargs: false - has_kwargs: true - description: Automatic train, evaluate and predict functions for the ML frameworks - - Scikit-Learn, XGBoost and LightGBM. + build: + code_origin: '' + origin_filename: '' + functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import mlrun
import mlrun.datastore
import mlrun.utils
import pandas as pd
from mlrun import feature_store as fs
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.frameworks.auto_mlrun import AutoMLRun
from mlrun.utils.helpers import create_class, create_function
from sklearn.model_selection import train_test_split

# Type alias for a path given either as a string or as a `pathlib.Path` object.
PathType = Union[str, Path]


class KWArgsPrefixes:
    """
    The prefixes used for routing prefixed keyword arguments to their destination.
    """

    # Prefix of the keyword arguments of the `train` function:
    TRAIN = "TRAIN_"
    # Prefix of the keyword arguments of the `fit` function:
    FIT = "FIT_"
    # Prefix of the keyword arguments of the model class:
    MODEL_CLASS = "CLASS_"


def _get_sub_dict_by_prefix(src: Dict, prefix_key: str) -> Dict[str, Any]:
    """
    Collect all the keys from the given dict that starts with the given prefix and creates a new dictionary with these
    keys.

    :param src:         The source dict to extract the values from.
    :param prefix_key:  Only keys with this prefix will be returned. The keys in the result dict will be without this
                        prefix.
    """
    return {
        key.replace(prefix_key, ""): val
        for key, val in src.items()
        if key.startswith(prefix_key)
    }


def _get_dataframe(
    context: MLClientCtx,
    dataset: DataItem,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: Union[str, List[str], int, List[int]] = None,
) -> Tuple[pd.DataFrame, Optional[Union[str, List[str]]]]:
    """
    Getting the DataFrame of the dataset and drop the columns accordingly.

    :param context:         MLRun context.
    :param dataset:         The dataset to train the model on.
                            Can be either a list of lists, dict, URI or a FeatureVector.
    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
                            Classification tasks.
    :param drop_columns:    str/int or a list of strings/ints that represent the column names/indices to drop.

    :returns: A tuple of the dataset as a DataFrame and the label columns (taken from the FeatureVector when they
              were not provided).

    :raises ValueError: If `label_columns` is missing for a dataset that is not a FeatureVector, or if string
                        `drop_columns` are given for a list/dict dataset.
    """
    # Check if dataset is list/dict first (before trying to access artifact_url)
    if isinstance(dataset, (list, dict)):
        # list/dict case:
        if not label_columns:
            message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
            context.logger.error(message)
            raise ValueError(message)
        dataset = pd.DataFrame(dataset)
        # Checking if drop_columns provided by integer type:
        if drop_columns:
            if isinstance(drop_columns, str) or (
                isinstance(drop_columns, list)
                and any(isinstance(col, str) for col in drop_columns)
            ):
                message = (
                    "drop_columns must be an integer/list of integers if not provided with a URI/FeatureVector dataset"
                )
                context.logger.error(message)
                raise ValueError(message)
            dataset.drop(drop_columns, axis=1, inplace=True)
    else:
        # Dataset is a DataItem with artifact_url (URI or FeatureVector)
        store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)

        # Getting the dataset:
        if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
            label_columns = label_columns or dataset.meta.status.label_column
            context.logger.info(f"label columns: {label_columns}")
            # FeatureVector case:
            try:
                fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
                dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
            except AttributeError:
                # Leave here for backwards compatibility
                dataset = fs.get_offline_features(
                    dataset.meta.uri, drop_columns=drop_columns
                ).to_dataframe()
        else:
            # simple URL case:
            if not label_columns:
                message = "label_columns not provided, mandatory when dataset is not a FeatureVector"
                context.logger.error(message)
                raise ValueError(message)
            dataset = dataset.as_df()
            if drop_columns:
                # A single column (str/int) is wrapped in a list: iterating a bare string would check its
                # characters one by one (so "id" would look for columns "i" and "d") and a bare int is not
                # iterable at all.
                columns_to_drop = (
                    list(drop_columns)
                    if isinstance(drop_columns, (list, tuple, set))
                    else [drop_columns]
                )
                if all(col in dataset for col in columns_to_drop):
                    dataset = dataset.drop(columns_to_drop, axis=1)
                else:
                    context.logger.info(
                        "not all of the columns to drop in the dataset, drop columns process skipped"
                    )

    return dataset, label_columns


def train(
    context: MLClientCtx,
    dataset: DataItem,
    model_class: str,
    label_columns: Optional[Union[str, List[str]]] = None,
    drop_columns: List[str] = None,
    model_name: str = "model",
    tag: str = "",
    sample_set: DataItem = None,
    test_set: DataItem = None,
    train_test_split_size: float = None,
    random_state: int = None,
    labels: dict = None,
    **kwargs,
):
    """
    Training a model with the given dataset.

    example::

        import mlrun
        project = mlrun.get_or_create_project("my-project")
        project.set_function("hub://auto_trainer", "train")
        trainer_run = project.run(
            name="train",
            handler="train",
            inputs={"dataset": "./path/to/dataset.csv"},
            params={
                "model_class": "sklearn.linear_model.LogisticRegression",
                "label_columns": "label",
                "drop_columns": "id",
                "model_name": "my-model",
                "tag": "v1.0.0",
                "sample_set": "./path/to/sample_set.csv",
                "test_set": "./path/to/test_set.csv",
                "CLASS_solver": "liblinear",
            },
        )

    :param context:                 MLRun context
    :param dataset:                 The dataset to train the model on. Can be either a URI or a FeatureVector
    :param model_class:             The class of the model, e.g. `sklearn.linear_model.LogisticRegression`
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop
    :param model_name:              The model's name to use for storing the model artifact, default to 'model'
    :param tag:                     The model's tag to log with
    :param sample_set:              A sample set of inputs for the model for logging its stats along the model in favour
                                    of model monitoring. Can be either a URI or a FeatureVector
    :param test_set:                The test set to test the model with. When provided, `drop_columns` are dropped
                                    from it as well.
    :param train_test_split_size:   if test_set was provided then this argument is ignored.
                                    Should be between 0.0 and 1.0 and represent the proportion of the dataset to include
                                    in the test split. The size of the Training set is set to the complement of this
                                    value. Default = 0.2
    :param random_state:            Relevant only when using train_test_split_size.
                                    A random state seed to shuffle the data. For more information, see:
                                    https://scikit-learn.org/stable/glossary.html#term-random_state
                                    Notice that here we only pass integer values.
    :param labels:                  Labels to log with the model
    :param kwargs:                  Here you can pass keyword arguments with prefixes,
                                    that will be parsed and passed to the relevant function, by the following prefixes:
                                    - `CLASS_` - for the model class arguments
                                    - `FIT_` - for the `fit` function arguments
                                    - `TRAIN_` - for the `train` function (in xgb or lgbm train function - future)

    :raises ValueError: If neither a test set nor a positive `train_test_split_size` is available.
    """
    # Validate inputs:
    # Only one of `test_set` / `train_test_split_size` is used - an explicit test set takes precedence:
    if test_set is None:
        if train_test_split_size is None:
            context.logger.info(
                "test_set or train_test_split_size are not provided, setting train_test_split_size to 0.2"
            )
            train_test_split_size = 0.2

    elif train_test_split_size:
        context.logger.info(
            "test_set provided, ignoring given train_test_split_size value"
        )
        train_test_split_size = None

    # Get DataFrame by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Getting the sample set:
    if sample_set is None:
        context.logger.info(
            "Sample set not given, using the whole training set as the sample set"
        )
        sample_set = dataset
    else:
        sample_set, _ = _get_dataframe(
            context=context,
            dataset=sample_set,
            label_columns=label_columns,
            drop_columns=drop_columns,
        )

    # Parsing kwargs:
    # TODO: Use in xgb or lgbm train function.
    train_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.TRAIN)
    fit_kwargs = _get_sub_dict_by_prefix(src=kwargs, prefix_key=KWArgsPrefixes.FIT)
    model_class_kwargs = _get_sub_dict_by_prefix(
        src=kwargs, prefix_key=KWArgsPrefixes.MODEL_CLASS
    )

    # Check if model or function:
    if hasattr(model_class, "train"):
        # TODO: Need to call: model(), afterwards to start the train function.
        # model = create_function(f"{model_class}.train")
        raise NotImplementedError
    else:
        # Creating model instance:
        model = create_class(model_class)(**model_class_kwargs)

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]
    if train_test_split_size:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=train_test_split_size, random_state=random_state
        )
    else:
        if test_set is None:
            # A falsy split size (e.g. 0) without a test set leaves nothing to test the model on.
            message = (
                "train_test_split_size must be greater than 0 when test_set is not provided"
            )
            context.logger.error(message)
            raise ValueError(message)

        # The whole dataset is used for training, the given test set is used for testing:
        x_train, y_train = x, y

        test_set = test_set.as_df()
        if drop_columns:
            # Drop the columns from the test set itself (and not from the training dataset), so the test set keeps
            # its own rows and has the same features as the training data:
            test_set = test_set.drop(drop_columns, axis=1)

        x_test, y_test = test_set.drop(label_columns, axis=1), test_set[label_columns]

    AutoMLRun.apply_mlrun(
        model=model,
        model_name=model_name,
        context=context,
        tag=tag,
        sample_set=sample_set,
        y_columns=label_columns,
        test_set=test_set,
        x_test=x_test,
        y_test=y_test,
        artifacts=context.artifacts,
        labels=labels,
    )
    context.logger.info(f"training '{model_name}'")
    model.fit(x_train, y_train, **fit_kwargs)


def evaluate(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: List[str] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    **kwargs,
):
    """
    Evaluating a model. Artifacts generated by the MLHandler.

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to evaluate the model on. Can be either a URI or a FeatureVector.
    :param drop_columns:            str or a list of strings that represent the columns to drop.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
                                    Labels given as a feature string (containing the feature separator) are resolved
                                    to their alias or label name, other labels are used as given.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # Parsing label_columns:
    if label_columns:
        if not isinstance(label_columns, list):
            label_columns = [label_columns]
        parsed_label_columns = []
        for lc in label_columns:
            if fs.common.feature_separator in lc:
                # A feature string - the dataframe column is named after the alias (if given) or the label name:
                _, label_name, alias = fs.common.parse_feature_string(lc)
                parsed_label_columns.append(alias or label_name)
            else:
                # A plain column name - keep it, so a mix of plain and feature-string labels is not
                # silently reduced to the feature-string ones only:
                parsed_label_columns.append(lc)
        label_columns = parsed_label_columns

    x = dataset.drop(label_columns, axis=1)
    y = dataset[label_columns]

    # Loading the model and predicting:
    # NOTE(review): the model name passed to `load_model` is hard-coded - confirm this is intended for models that
    # were logged under a different name.
    model_handler = AutoMLRun.load_model(
        model_path=model, context=context, model_name="model_LinearRegression"
    )
    AutoMLRun.apply_mlrun(model_handler.model, y_test=y, model_path=model)

    context.logger.info(f"evaluating '{model_handler.model_name}'")
    model_handler.model.predict(x, **kwargs)


def predict(
    context: MLClientCtx,
    model: str,
    dataset: mlrun.DataItem,
    drop_columns: Union[str, List[str], int, List[int]] = None,
    label_columns: Optional[Union[str, List[str]]] = None,
    result_set: Optional[str] = None,
    **kwargs,
):
    """
    Predicting dataset by a model.

    The prediction is logged as a dataset artifact holding the input dataset columns followed by the predicted
    label column(s).

    :param context:                 MLRun context.
    :param model:                   The model Store path.
    :param dataset:                 The dataset to predict the model on. Can be either a URI, a FeatureVector or a
                                    sample in a shape of a list/dict.
                                    When passing a sample, pass the dataset as a field in `params` instead of `inputs`.
    :param drop_columns:            str/int or a list of strings/ints that represent the column names/indices to drop.
                                    When the dataset is a list/dict this parameter should be represented by integers.
    :param label_columns:           The target label(s) of the column(s) in the dataset. for Regression or
                                    Classification tasks. Mandatory when dataset is not a FeatureVector.
    :param result_set:              The db key to set name of the prediction result and the filename.
                                    Default to 'prediction'.
    :param kwargs:                  Here you can pass keyword arguments to the predict function
                                    (PREDICT_ prefix is not required).

    :raises ValueError: If more label columns were given than the number of predicted outputs, or if one of the
                        label columns already exists in the dataset.
    """
    # Get dataset by URL or by FeatureVector:
    dataset, label_columns = _get_dataframe(
        context=context,
        dataset=dataset,
        label_columns=label_columns,
        drop_columns=drop_columns,
    )

    # loading the model, and getting the model handler:
    model_handler = AutoMLRun.load_model(model_path=model, context=context)

    # Fix feature names for models that require them (e.g., XGBoost)
    # When dataset comes from a list, pandas assigns default integer column names
    # but some models expect specific feature names they were trained with
    if hasattr(model_handler.model, 'feature_names_in_'):
        expected_features = model_handler.model.feature_names_in_
        if len(dataset.columns) == len(expected_features):
            # Only rename if the number of columns matches
            # This handles the case where a list was converted to DataFrame with default column names
            if not all(col == feat for col, feat in zip(dataset.columns, expected_features)):
                context.logger.info(
                    "Renaming dataset columns to match model's expected feature names"
                )
                dataset.columns = expected_features

    # Normalizing the label columns into a list owned by this function (a copy is taken so extending it below does
    # not mutate a list object that belongs to the caller):
    if not label_columns:
        label_columns = []
    elif isinstance(label_columns, str):
        label_columns = [label_columns]
    else:
        label_columns = list(label_columns)

    # Predicting:
    context.logger.info(f"making prediction by '{model_handler.model_name}'")
    y_pred = model_handler.model.predict(dataset, **kwargs)

    # Preparing and validating label columns for the dataframe of the prediction result:
    num_predicted = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]
    num_given = len(label_columns)

    if num_predicted > num_given:
        if num_predicted == 1:
            label_columns = ["predicted labels"]
        else:
            # Generate names for the predicted outputs that were not named by the user:
            label_columns.extend(
                f"predicted_label_{i + 1 + num_given}"
                for i in range(num_predicted - num_given)
            )
    elif num_predicted < num_given:
        message = (
            f"number of predicted labels: {num_predicted} is smaller than number of label columns: {num_given}"
        )
        context.logger.error(message)
        raise ValueError(message)

    artifact_name = result_set or "prediction"
    labels_inside_df = set(label_columns) & set(dataset.columns.tolist())
    if labels_inside_df:
        message = f"The labels: {labels_inside_df} are already existed in the dataframe"
        context.logger.error(message)
        raise ValueError(message)

    # The predictions are given the dataset's own index: `pd.concat` on axis=1 aligns rows by index, so a fresh
    # RangeIndex would misalign predictions and rows whenever the dataset index is not the default 0..n-1 one.
    predictions = pd.DataFrame(y_pred, columns=label_columns, index=dataset.index)
    pred_df = pd.concat([dataset, predictions], axis=1)
    context.log_dataset(artifact_name, pred_df, db_key=result_set)
 + command: '' default_handler: train image: mlrun/mlrun - command: '' + disable_auto_mount: false + description: Automatic train, evaluate and predict functions for the ML frameworks + - Scikit-Learn, XGBoost and LightGBM. metadata: - tag: '' - name: auto-trainer categories: - machine-learning - model-training + tag: '' + name: auto-trainer diff --git a/functions/src/auto_trainer/requirements.txt b/functions/src/auto_trainer/requirements.txt index b14a0293c..1f735026c 100644 --- a/functions/src/auto_trainer/requirements.txt +++ b/functions/src/auto_trainer/requirements.txt @@ -1,4 +1,4 @@ pandas -scikit-learn<1.4.0 +scikit-learn~=1.5.2 xgboost<2.0.0 plotly diff --git a/functions/src/auto_trainer/test_auto_trainer.py b/functions/src/auto_trainer/test_auto_trainer.py index 06b553a35..2874f3368 100644 --- a/functions/src/auto_trainer/test_auto_trainer.py +++ b/functions/src/auto_trainer/test_auto_trainer.py @@ -78,6 +78,10 @@ def _assert_train_handler(train_run): @pytest.mark.parametrize("model", MODELS) +@pytest.mark.skipif( + condition=not _validate_environment_variables(), + reason="Project's environment variables are not set", +) def test_train(model: Tuple[str, str]): dataset, label_columns = _get_dataset(model[1]) is_test_passed = True diff --git a/functions/src/describe/requirements.txt b/functions/src/describe/requirements.txt index 7a15c8465..ac445e6d6 100644 --- a/functions/src/describe/requirements.txt +++ b/functions/src/describe/requirements.txt @@ -1,4 +1,4 @@ -scikit-learn~=1.5 +scikit-learn~=1.5.2 plotly~=5.23 pytest~=7.0.1 matplotlib~=3.5.1 diff --git a/functions/src/gen_class_data/requirements.txt b/functions/src/gen_class_data/requirements.txt index fc53d535f..e265290f6 100644 --- a/functions/src/gen_class_data/requirements.txt +++ b/functions/src/gen_class_data/requirements.txt @@ -1,2 +1,2 @@ pandas -scikit-learn~=1.5 \ No newline at end of file +scikit-learn~=1.5.2 \ No newline at end of file diff --git 
a/functions/src/sklearn_classifier/requirements.txt b/functions/src/sklearn_classifier/requirements.txt index 97a565a9e..113d4a02a 100644 --- a/functions/src/sklearn_classifier/requirements.txt +++ b/functions/src/sklearn_classifier/requirements.txt @@ -1,5 +1,5 @@ pandas -scikit-learn~=1.5 +scikit-learn~=1.5.2 matplotlib seaborn scikit-plot From 94ff0d5d1758858fa58aa85be397be8bd8fff6f4 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Wed, 11 Feb 2026 18:22:39 +0200 Subject: [PATCH 14/15] revert sklearn_classifier.py changes change XGBRegressor to LGBMRegressor --- functions/src/auto_trainer/auto_trainer.py | 14 -- functions/src/auto_trainer/requirements.txt | 2 +- .../src/auto_trainer/test_auto_trainer.py | 2 +- .../src/sklearn_classifier/function.yaml | 109 ++-------------- functions/src/sklearn_classifier/item.yaml | 4 +- .../src/sklearn_classifier/requirements.txt | 2 +- .../sklearn_classifier/sklearn_classifier.py | 121 +----------------- .../test_sklearn_classifier.py | 28 ++-- 8 files changed, 29 insertions(+), 253 deletions(-) diff --git a/functions/src/auto_trainer/auto_trainer.py b/functions/src/auto_trainer/auto_trainer.py index ab2c6ee88..4e53e5b7e 100755 --- a/functions/src/auto_trainer/auto_trainer.py +++ b/functions/src/auto_trainer/auto_trainer.py @@ -366,20 +366,6 @@ def predict( # loading the model, and getting the model handler: model_handler = AutoMLRun.load_model(model_path=model, context=context) - # Fix feature names for models that require them (e.g., XGBoost) - # When dataset comes from a list, pandas assigns default integer column names - # but some models expect specific feature names they were trained with - if hasattr(model_handler.model, 'feature_names_in_'): - expected_features = model_handler.model.feature_names_in_ - if len(dataset.columns) == len(expected_features): - # Only rename if the number of columns matches - # This handles the case where a list was converted to DataFrame with default column names - if not all(col == feat for 
col, feat in zip(dataset.columns, expected_features)): - context.logger.info( - f"Renaming dataset columns to match model's expected feature names" - ) - dataset.columns = expected_features - # Dropping label columns if necessary: if not label_columns: label_columns = [] diff --git a/functions/src/auto_trainer/requirements.txt b/functions/src/auto_trainer/requirements.txt index 1f735026c..262f5e9f2 100644 --- a/functions/src/auto_trainer/requirements.txt +++ b/functions/src/auto_trainer/requirements.txt @@ -1,4 +1,4 @@ pandas scikit-learn~=1.5.2 -xgboost<2.0.0 +lightgbm plotly diff --git a/functions/src/auto_trainer/test_auto_trainer.py b/functions/src/auto_trainer/test_auto_trainer.py index 2874f3368..fe5c051a4 100644 --- a/functions/src/auto_trainer/test_auto_trainer.py +++ b/functions/src/auto_trainer/test_auto_trainer.py @@ -28,7 +28,7 @@ MODELS = [ ("sklearn.linear_model.LinearRegression", "regression"), ("sklearn.ensemble.RandomForestClassifier", "classification"), - ("xgboost.XGBRegressor", "regression"), + ("lightgbm.LGBMRegressor", "regression"), ] REQUIRED_ENV_VARS = [ diff --git a/functions/src/sklearn_classifier/function.yaml b/functions/src/sklearn_classifier/function.yaml index 208497ecc..205df697d 100644 --- a/functions/src/sklearn_classifier/function.yaml +++ b/functions/src/sklearn_classifier/function.yaml @@ -1,98 +1,10 @@ -kind: job spec: - default_handler: train_model - command: '' image: mlrun/mlrun + description: train any classifier using scikit-learn's API + default_handler: train_model entry_points: - get_sample: - has_kwargs: false - has_varargs: false - lineno: 33 - parameters: - - name: dataset - type: DataItem - doc: DataItem containing the dataset - - name: sample - type: int - doc: Number of samples to take. If -1, use all. If < -1, take random sample. 
- - name: label_column - type: str - doc: Name of the label column - outputs: - - type: Tuple[pd.DataFrame, pd.Series, list] - name: get_sample - doc: Get a sample of the dataset with labels separated. - get_splits: - has_kwargs: false - has_varargs: false - lineno: 56 - parameters: - - name: features - type: DataFrame - doc: Feature DataFrame - - name: labels - type: Series - doc: Labels Series - - name: num_splits - type: int - doc: Number of splits (3 for train/val/test) - - name: test_size - type: float - doc: Proportion for test set - - name: val_size - type: float - doc: Proportion of remaining data for validation - - name: random_state - type: int - doc: Random seed - default: 1 - outputs: - - type: List[Tuple[pd.DataFrame, pd.Series]] - name: get_splits - doc: Split data into train, validation, and test sets. - gen_sklearn_model: - has_kwargs: false - has_varargs: false - lineno: 86 - parameters: - - name: model_pkg_class - type: str - doc: Full class path (e.g., "sklearn.ensemble.RandomForestClassifier") - - name: parameters - type: list - doc: List of (key, value) parameter tuples - outputs: - - type: dict - name: gen_sklearn_model - doc: Generate sklearn model configuration from class name and parameters. - eval_model_v2: - has_kwargs: false - has_varargs: false - lineno: 117 - parameters: - - name: context - type: MLClientCtx - doc: MLRun context - - name: xvalid - type: DataFrame - doc: Validation features - - name: yvalid - type: Series - doc: Validation labels - - name: model - doc: Trained sklearn model - - name: plots_artifact_path - type: str - doc: Path for plots (not used in this simplified version) - default: null - outputs: - - type: dict - name: eval_model_v2 - doc: Evaluate a sklearn classifier model. train_model: - has_kwargs: false has_varargs: false - lineno: 148 parameters: - name: context type: MLClientCtx @@ -116,7 +28,7 @@ spec: type: int doc: Selects the first n rows, or select a sample starting from the first. 
If negative <-1, select a random sample - default: + default: - name: test_size type: float doc: (0.05) test set size @@ -152,8 +64,6 @@ spec: type: int doc: (1) sklearn rng seed default: 1 - outputs: - - type: None name: train_model doc: 'train a classifier @@ -166,16 +76,21 @@ spec: scalar "results", a "plots" keys with a list of PlotArtifacts, and and "tables" key containing a returned list of TableArtifacts.' + outputs: + - type: None + lineno: 32 + has_kwargs: false + disable_auto_mount: false build: - functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Generated by nuclio.export.NuclioExporter

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)


from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union

import numpy as np
import pandas as pd
from cloudpickle import dumps
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from mlrun.utils.helpers import create_class
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


def get_sample(dataset: DataItem, sample: int, label_column: str) -> Tuple[pd.DataFrame, pd.Series, list]:
    """Load a dataset, optionally sub-sample its rows, and separate the labels.

    :param dataset:      DataItem containing the dataset; it is materialised with ``as_df()``
    :param sample:       Row-selection policy:

                         * ``-1``   - keep every row
                         * ``< -1`` - draw ``abs(sample)`` rows at random without replacement
                           (fixed seed ``1``); the request is capped at the number of
                           available rows
                         * ``>= 0`` - keep the first ``sample`` rows (``DataFrame.head``)
    :param label_column: Name of the label column

    :returns: A ``(features, labels, header)`` tuple: the frame without the label column,
              the label column as a Series, and the list of feature column names.
    """
    df = dataset.as_df()

    if sample == -1:
        sampled_df = df
    elif sample < -1:
        # DataFrame.sample (replace=False) raises ValueError when asked for more
        # rows than exist; cap the request so an over-sized sample degrades to
        # "all rows", matching how head() behaves in the branch below.
        sampled_df = df.sample(n=min(-sample, len(df)), random_state=1)
    else:
        sampled_df = df.head(sample)

    labels = sampled_df[label_column]
    features = sampled_df.drop(label_column, axis=1)
    header = list(features.columns)

    return features, labels, header


def get_splits(
    features: pd.DataFrame,
    labels: pd.Series,
    num_splits: int,
    test_size: float,
    val_size: float,
    random_state: int = 1
) -> List[Tuple[pd.DataFrame, pd.Series]]:
    """Carve the data into train, validation and test partitions.

    Two consecutive ``train_test_split`` calls are made with the same seed:
    the first removes the test partition, the second divides what is left
    into train and validation.

    :param features: Feature DataFrame
    :param labels: Labels Series
    :param num_splits: Accepted for interface compatibility; the value is not read
                       and three partitions are always produced
    :param test_size: Proportion of the full data held out as the test set
    :param val_size: Proportion of the remaining (non-test) data used for validation
    :param random_state: Random seed used for both splits

    :returns: ``[(X_train, y_train), (X_val, y_val), (X_test, y_test)]``
    """
    # Stage 1: peel the test partition off the full dataset.
    rest_x, test_x, rest_y, test_y = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

    # Stage 2: divide the remainder into train and validation.
    train_x, valid_x, train_y, valid_y = train_test_split(
        rest_x,
        rest_y,
        test_size=val_size,
        random_state=random_state,
    )

    train_part = (train_x, train_y)
    valid_part = (valid_x, valid_y)
    test_part = (test_x, test_y)
    return [train_part, valid_part, test_part]


def gen_sklearn_model(
    model_pkg_class: str,
    parameters: Union[Mapping[str, Any], Iterable[Tuple[str, Any]], None],
) -> dict:
    """Generate sklearn model configuration from class name and parameters.

    Each parameter is routed into one of two buckets, or dropped:

    * ``X``, ``y`` and ``sample_weight`` go to ``config["FIT"]``
    * names that belong to ``train_model`` itself (dataset, label column,
      destinations, ...) are dropped
    * everything else goes to ``config["CLASS"]`` (constructor arguments)

    :param model_pkg_class: Full class path (e.g., "sklearn.ensemble.RandomForestClassifier")
    :param parameters: Either a mapping of name -> value, or an iterable of
                       ``(key, value)`` pairs such as ``dict.items()``.
                       ``None`` is treated as "no parameters".

    :returns: ``{"META": {"class": model_pkg_class}, "CLASS": {...}, "FIT": {...}}``
    """
    # Iterating a mapping directly yields only its keys, which would either
    # fail to unpack or silently split two-character keys into (key, value);
    # normalise to an iterable of pairs first.
    if parameters is None:
        pairs = ()
    elif isinstance(parameters, Mapping):
        pairs = parameters.items()
    else:
        pairs = parameters

    # Keyword arguments destined for ``fit`` rather than the constructor
    fit_params = frozenset(("X", "y", "sample_weight"))

    # Parameters that should not be passed to sklearn model
    excluded_params = frozenset((
        'model_pkg_class', 'dataset', 'label_column', 'encode_cols',
        'sample', 'test_size', 'train_val_split', 'test_set_key',
        'model_evaluator', 'models_dest', 'plots_dest', 'file_ext',
        'model_pkg_file', 'context',
    ))

    config = {
        "META": {"class": model_pkg_class},
        "CLASS": {},
        "FIT": {},
    }

    # Separate parameters into model init params and fit params
    for key, value in pairs:
        if key in fit_params:
            config["FIT"][key] = value
        elif key not in excluded_params:
            # Only add parameters that are not function-specific
            config["CLASS"][key] = value

    return config


def eval_model_v2(
    context: MLClientCtx,
    xvalid: pd.DataFrame,
    yvalid: pd.Series,
    model,
    plots_artifact_path: str = None
) -> dict:
    """Score a fitted classifier on the validation split and log the scores.

    Accuracy plus weighted precision, recall and F1 are computed from
    ``model.predict(xvalid)``; each score is then reported through
    ``context.log_result`` under the keys ``accuracy``, ``precision``,
    ``recall`` and ``f1_score``.

    :param context: MLRun context that receives the results
    :param xvalid: Validation features
    :param yvalid: Validation labels
    :param model: Trained model exposing ``predict``
    :param plots_artifact_path: Accepted for interface compatibility; not used here

    :returns: Always an empty dict (no extra artifacts are produced)
    """
    predictions = model.predict(xvalid)

    # Options shared by the three class-averaged scores.
    weighted = {"average": "weighted", "zero_division": 0}

    # Every score is computed before anything is logged, so a failure in any
    # scorer leaves the context untouched.
    scores = {}
    scores["accuracy"] = accuracy_score(yvalid, predictions)
    scores["precision"] = precision_score(yvalid, predictions, **weighted)
    scores["recall"] = recall_score(yvalid, predictions, **weighted)
    scores["f1_score"] = f1_score(yvalid, predictions, **weighted)

    for name in scores:
        context.log_result(name, scores[name])

    return {}


def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    encode_cols: Optional[Union[Dict[str, str], List[str]]] = None,
    sample: int = -1,
    test_size: float = 0.30,
    train_val_split: float = 0.70,
    test_set_key: str = "test_set",
    model_evaluator=None,
    models_dest: str = "",
    plots_dest: str = "plots",
    file_ext: str = "parquet",
    model_pkg_file: str = "",
    random_state: int = 1,
) -> None:
    """Train a classifier, log its held-out test set and log the fitted model.

    An optional custom model evaluator can be supplied. It is called as
    `my_custom_evaluator(context, xvalid, yvalid, model, plots_artifact_path=...)`
    and its return value is attached to the logged model as `extra_data`.

    Run parameters found in `context.parameters` that are not arguments of
    this function are forwarded to the model class constructor.

    :param context:           the function context
    :param model_pkg_class:   the model to train, given as a dotted class path,
                              e.g, "sklearn.neural_network.MLPClassifier"
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       columns to one-hot encode (first level dropped):
                              either a dictionary of column names and prefixes,
                              or a plain list of column names (pandas' default
                              prefixes are then used). None/empty disables
                              encoding.
    :param sample:            -1 keeps all rows, a non-negative n selects the
                              first n rows, a value < -1 selects a random
                              sample of abs(sample) rows
    :param test_size:         (0.30) test set size
    :param train_val_split:   (0.70) Once the test set has been removed the
                              training set gets this proportion; the rest is
                              the validation set.
    :param test_set_key:      key of held out data in artifact store
    :param model_evaluator:   (None) a custom model evaluator can be specified
    :param models_dest:       ("") models subfolder on artifact path, "model"
                              when left empty
    :param plots_dest:        plot subfolder on artifact path
    :param file_ext:          ("parquet") format for test_set_key hold out data
    :param model_pkg_file:    ("") accepted but not read by this function
    :param random_state:      (1) sklearn rng seed

    """
    models_dest = models_dest or "model"

    # The third return value (feature header) is not needed here.
    raw, labels, _ = get_sample(dataset, sample, label_column)

    if encode_cols:
        if isinstance(encode_cols, dict):
            dummy_columns = list(encode_cols.keys())
            dummy_prefixes = list(encode_cols.values())
        else:
            # A bare list of column names: let pandas derive the prefixes.
            dummy_columns = list(encode_cols)
            dummy_prefixes = None
        raw = pd.get_dummies(
            raw,
            columns=dummy_columns,
            prefix=dummy_prefixes,
            drop_first=True,
        )

    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, 1 - train_val_split, random_state
    )

    # Persist the held-out partition before training so it is available even
    # if fitting fails.
    test_set = pd.concat([xtest, ytest.to_frame()], axis=1)
    context.log_dataset(
        test_set_key,
        df=test_set,
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath("data"),
    )

    model_config = gen_sklearn_model(model_pkg_class, context.parameters.items())

    model_config["FIT"].update({"X": xtrain, "y": ytrain.values})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)
    if model_evaluator:
        eval_metrics = model_evaluator(
            context, xvalid, yvalid, model, plots_artifact_path=plots_path
        )
    else:
        eval_metrics = eval_model_v2(
            context, xvalid, yvalid, model, plots_artifact_path=plots_path
        )

    # NOTE(review): the held-out test set is what gets passed as
    # `training_set` below - confirm this is intended.
    kwargs = {"training_set": test_set, "label_column": label_column}
    # Use the bare class name as the algorithm label when the path is dotted.
    split = model_pkg_class.rsplit(".", 1)
    if len(split) == 2:
        kwargs["algorithm"] = split[1]

    if dataset.meta and dataset.meta.kind == "FeatureVector":
        kwargs["feature_vector"] = dataset.meta.uri

    context.set_label("class", model_pkg_class)
    context.log_model(
        "model",
        body=dumps(model),
        artifact_path=artifact_path,
        extra_data=eval_metrics,
        model_file="model.pkl",
        metrics=context.results,
        labels={"class": model_pkg_class},
        framework="sklearn",
        **kwargs
    )
 + functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKIyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IHdhcm5pbmdzCgp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSJpZ25vcmUiLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKCmZyb20gY2xvdWRwaWNrbGUgaW1wb3J0IGR1bXBzCmltcG9ydCBwYW5kYXMgYXMgcGQKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLm1sdXRpbHMuZGF0YSBpbXBvcnQgZ2V0X3NhbXBsZSwgZ2V0X3NwbGl0cwpmcm9tIG1scnVuLm1sdXRpbHMubW9kZWxzIGltcG9ydCBnZW5fc2tsZWFybl9tb2RlbCwgZXZhbF9tb2RlbF92Mgpmcm9tIG1scnVuLnV0aWxzLmhlbHBlcnMgaW1wb3J0IGNyZWF0ZV9jbGFzcwoKCmRlZiB0cmFpbl9tb2RlbCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgbW9kZWxfcGtnX2NsYXNzOiBzdHIsCiAgICBkYXRhc2V0OiBEYXRhSXRlbSwKICAgIGxhYmVsX2NvbHVtbjogc3RyID0gImxhYmVscyIsCiAgICBlbmNvZGVfY29sczogTGlzdFtzdHJdID0gW10sCiAgICBzYW1wbGU6IGludCA9IC0xLAogICAgdGVzdF9zaXplOiBmbG9hdCA9IDAuMzAsCiAgICB0cmFpbl92YWxfc3BsaXQ6IGZsb2F0ID0gMC43MCwKICAgIHRlc3Rfc2V0X2tleTogc3RyID0gInRlc3Rfc2V0IiwKICAgIG1vZGVsX2V2YWx1YXRvcj1Ob25lLAogICAgbW9kZWxzX2Rlc3Q6IHN0ciA9ICIiLAogICAgcGxvdHNfZGVzdDogc3RyID0gInBsb3RzIiwKICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICBtb2RlbF9wa2dfZmlsZTogc3RyID0gIiIsCiAgICByYW5kb21fc3RhdGU6IGludCA9IDEsCikgLT4gTm9uZToKI
CAgICIiInRyYWluIGEgY2xhc3NpZmllcgoKICAgIEFuIG9wdGlvbmFsIGN1dG9tIG1vZGVsIGV2YWx1YXRvciBjYW4gYmUgc3VwcGxpZWQgdGhhdCBzaG91bGQgaGF2ZSB0aGUgc2lnbmF0dXJlOgogICAgYG15X2N1c3RvbV9ldmFsdWF0b3IoY29udGV4dCwgeHZhbGlkLCB5dmFsaWQsIG1vZGVsKWAgYW5kIHJldHVybiBhIGRpY3Rpb25hcnkgb2YKICAgIHNjYWxhciAicmVzdWx0cyIsIGEgInBsb3RzIiBrZXlzIHdpdGggYSBsaXN0IG9mIFBsb3RBcnRpZmFjdHMsIGFuZAogICAgYW5kICJ0YWJsZXMiIGtleSBjb250YWluaW5nIGEgcmV0dXJuZWQgbGlzdCBvZiBUYWJsZUFydGlmYWN0cy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICAgIHRoZSBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gbW9kZWxfcGtnX2NsYXNzOiAgIHRoZSBtb2RlbCB0byB0cmFpbiwgZS5nLCAic2tsZWFybi5uZXVyYWxfbmV0d29ya3MuTUxQQ2xhc3NpZmllciIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIG9yIGpzb24gbW9kZWwgY29uZmlnCiAgICA6cGFyYW0gZGF0YXNldDogICAgICAgICAgICgiZGF0YSIpIG5hbWUgb2YgcmF3IGRhdGEgZmlsZQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogICAgICBncm91bmQtdHJ1dGggKHkpIGxhYmVscwogICAgOnBhcmFtIGVuY29kZV9jb2xzOiAgICAgICBkaWN0aW9uYXJ5IG9mIG5hbWVzIGFuZCBwcmVmaXhlcyBmb3IgY29sdW1ucyB0aGF0IGFyZQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0byBob3QgYmUgZW5jb2RlZC4KICAgIDpwYXJhbSBzYW1wbGU6ICAgICAgICAgICAgU2VsZWN0cyB0aGUgZmlyc3QgbiByb3dzLCBvciBzZWxlY3QgYSBzYW1wbGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc3RhcnRpbmcgZnJvbSB0aGUgZmlyc3QuIElmIG5lZ2F0aXZlIDwtMSwgc2VsZWN0CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGEgcmFuZG9tIHNhbXBsZQogICAgOnBhcmFtIHRlc3Rfc2l6ZTogICAgICAgICAoMC4wNSkgdGVzdCBzZXQgc2l6ZQogICAgOnBhcmFtIHRyYWluX3ZhbF9zcGxpdDogICAoMC43NSkgT25jZSB0aGUgdGVzdCBzZXQgaGFzIGJlZW4gcmVtb3ZlZCB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdHJhaW5pbmcgc2V0IGdldHMgdGhpcyBwcm9wb3J0aW9uLgogICAgOnBhcmFtIHRlc3Rfc2V0X2tleTogICAgICBrZXkgb2YgaGVsZCBvdXQgZGF0YSBpbiBhcnRpZmFjdCBzdG9yZQogICAgOnBhcmFtIG1vZGVsX2V2YWx1YXRvcjogICAoTm9uZSkgYSBjdXN0b20gbW9kZWwgZXZhbHVhdG9yIGNhbiBiZSBzcGVjaWZpZWQKICAgIDpwYXJhbSBtb2RlbHNfZGVzdDogICAgICAgKCIiKSBtb2RlbHMgc3ViZm9sZGVyIG9uIGFydGlmYWN0IHBhdGgKICAgIDpwYXJhbSBwbG90c19kZXN0OiAgICAgICAgcGxvdCBzdWJmb2xkZXIgb24gYXJ0aWZhY3QgcGF0aAogICAgOnBhcmFtIGZpbGVfZXh0OiAgICAgICAgICAoInBhcnF1ZXQiKSBmb3JtY
XQgZm9yIHRlc3Rfc2V0X2tleSBob2xkIG91dCBkYXRhCiAgICA6cGFyYW0gcmFuZG9tX3N0YXRlOiAgICAgICgxKSBza2xlYXJuIHJuZyBzZWVkCgogICAgIiIiCiAgICBtb2RlbHNfZGVzdCA9IG1vZGVsc19kZXN0IG9yICJtb2RlbCIKCiAgICByYXcsIGxhYmVscywgaGVhZGVyID0gZ2V0X3NhbXBsZShkYXRhc2V0LCBzYW1wbGUsIGxhYmVsX2NvbHVtbikKCiAgICBpZiBlbmNvZGVfY29sczoKICAgICAgICByYXcgPSBwZC5nZXRfZHVtbWllcygKICAgICAgICAgICAgcmF3LAogICAgICAgICAgICBjb2x1bW5zPWxpc3QoZW5jb2RlX2NvbHMua2V5cygpKSwKICAgICAgICAgICAgcHJlZml4PWxpc3QoZW5jb2RlX2NvbHMudmFsdWVzKCkpLAogICAgICAgICAgICBkcm9wX2ZpcnN0PVRydWUsCiAgICAgICAgKQoKICAgICh4dHJhaW4sIHl0cmFpbiksICh4dmFsaWQsIHl2YWxpZCksICh4dGVzdCwgeXRlc3QpID0gZ2V0X3NwbGl0cygKICAgICAgICByYXcsIGxhYmVscywgMywgdGVzdF9zaXplLCAxIC0gdHJhaW5fdmFsX3NwbGl0LCByYW5kb21fc3RhdGUKICAgICkKCiAgICB0ZXN0X3NldCA9IHBkLmNvbmNhdChbeHRlc3QsIHl0ZXN0LnRvX2ZyYW1lKCldLCBheGlzPTEpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KAogICAgICAgIHRlc3Rfc2V0X2tleSwKICAgICAgICBkZj10ZXN0X3NldCwKICAgICAgICBmb3JtYXQ9ZmlsZV9leHQsCiAgICAgICAgaW5kZXg9RmFsc2UsCiAgICAgICAgbGFiZWxzPXsiZGF0YS10eXBlIjogImhlbGQtb3V0In0sCiAgICAgICAgYXJ0aWZhY3RfcGF0aD1jb250ZXh0LmFydGlmYWN0X3N1YnBhdGgoImRhdGEiKSwKICAgICkKCiAgICBtb2RlbF9jb25maWcgPSBnZW5fc2tsZWFybl9tb2RlbChtb2RlbF9wa2dfY2xhc3MsIGNvbnRleHQucGFyYW1ldGVycy5pdGVtcygpKQoKICAgIG1vZGVsX2NvbmZpZ1siRklUIl0udXBkYXRlKHsiWCI6IHh0cmFpbiwgInkiOiB5dHJhaW4udmFsdWVzfSkKCiAgICBDbGFzc2lmaWVyQ2xhc3MgPSBjcmVhdGVfY2xhc3MobW9kZWxfY29uZmlnWyJNRVRBIl1bImNsYXNzIl0pCgogICAgbW9kZWwgPSBDbGFzc2lmaWVyQ2xhc3MoKiptb2RlbF9jb25maWdbIkNMQVNTIl0pCgogICAgbW9kZWwuZml0KCoqbW9kZWxfY29uZmlnWyJGSVQiXSkKCiAgICBhcnRpZmFjdF9wYXRoID0gY29udGV4dC5hcnRpZmFjdF9zdWJwYXRoKG1vZGVsc19kZXN0KQogICAgcGxvdHNfcGF0aCA9IGNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aChtb2RlbHNfZGVzdCwgcGxvdHNfZGVzdCkKICAgIGlmIG1vZGVsX2V2YWx1YXRvcjoKICAgICAgICBldmFsX21ldHJpY3MgPSBtb2RlbF9ldmFsdWF0b3IoCiAgICAgICAgICAgIGNvbnRleHQsIHh2YWxpZCwgeXZhbGlkLCBtb2RlbCwgcGxvdHNfYXJ0aWZhY3RfcGF0aD1wbG90c19wYXRoCiAgICAgICAgKQogICAgZWxzZToKICAgICAgICBldmFsX21ldHJpY3MgPSBldmFsX21vZGVsX3YyKAogICAgICAgICAgICBjb250ZXh0LCB4dmFsaWQsIHl2YWxpZCwgbW9kZ
WwsIHBsb3RzX2FydGlmYWN0X3BhdGg9cGxvdHNfcGF0aAogICAgICAgICkKCiAgICBrd2FyZ3MgPSB7InRyYWluaW5nX3NldCI6IHRlc3Rfc2V0LCAibGFiZWxfY29sdW1uIjogbGFiZWxfY29sdW1ufQogICAgc3BsaXQgPSBtb2RlbF9wa2dfY2xhc3MucnNwbGl0KCIuIiwgMSkKICAgIGlmIHNwbGl0IGFuZCBsZW4oc3BsaXQpID09IDI6CiAgICAgICAga3dhcmdzWyJhbGdvcml0aG0iXSA9IHNwbGl0WzFdCgogICAgaWYgZGF0YXNldC5tZXRhIGFuZCBkYXRhc2V0Lm1ldGEua2luZCA9PSAiRmVhdHVyZVZlY3RvciI6CiAgICAgICAga3dhcmdzWyJmZWF0dXJlX3ZlY3RvciJdID0gZGF0YXNldC5tZXRhLnVyaQoKICAgIGNvbnRleHQuc2V0X2xhYmVsKCJjbGFzcyIsIG1vZGVsX3BrZ19jbGFzcykKICAgIGNvbnRleHQubG9nX21vZGVsKAogICAgICAgICJtb2RlbCIsCiAgICAgICAgYm9keT1kdW1wcyhtb2RlbCksCiAgICAgICAgYXJ0aWZhY3RfcGF0aD1hcnRpZmFjdF9wYXRoLAogICAgICAgIGV4dHJhX2RhdGE9ZXZhbF9tZXRyaWNzLAogICAgICAgIG1vZGVsX2ZpbGU9Im1vZGVsLnBrbCIsCiAgICAgICAgbWV0cmljcz1jb250ZXh0LnJlc3VsdHMsCiAgICAgICAgbGFiZWxzPXsiY2xhc3MiOiBtb2RlbF9wa2dfY2xhc3N9LAogICAgICAgIGZyYW1ld29yaz0ic2tsZWFybiIsCiAgICAgICAgKiprd2FyZ3MKICAgICkK origin_filename: '' code_origin: '' - disable_auto_mount: false - description: train any classifier using scikit-learn's API + command: '' metadata: + tag: '' name: sklearn-classifier categories: - machine-learning - model-training - tag: '' verbose: false +kind: job diff --git a/functions/src/sklearn_classifier/item.yaml b/functions/src/sklearn_classifier/item.yaml index 4fa374938..b9726fb79 100644 --- a/functions/src/sklearn_classifier/item.yaml +++ b/functions/src/sklearn_classifier/item.yaml @@ -13,7 +13,7 @@ labels: framework: sklearn maintainers: [] marketplaceType: '' -mlrunVersion: 1.10.0 +mlrunVersion: 1.7.0 name: sklearn-classifier platformVersion: 3.5.3 spec: @@ -23,5 +23,5 @@ spec: kind: job requirements: [] url: '' -version: 1.3.0 +version: 1.2.0 test_valid: false diff --git a/functions/src/sklearn_classifier/requirements.txt b/functions/src/sklearn_classifier/requirements.txt index 113d4a02a..4d9e097f9 100644 --- a/functions/src/sklearn_classifier/requirements.txt +++ b/functions/src/sklearn_classifier/requirements.txt @@ -1,5 +1,5 @@ pandas 
-scikit-learn~=1.5.2 +scikit-learn==1.0.2 matplotlib seaborn scikit-plot diff --git a/functions/src/sklearn_classifier/sklearn_classifier.py b/functions/src/sklearn_classifier/sklearn_classifier.py index 2fc60a102..1a73d4045 100644 --- a/functions/src/sklearn_classifier/sklearn_classifier.py +++ b/functions/src/sklearn_classifier/sklearn_classifier.py @@ -21,129 +21,14 @@ from cloudpickle import dumps import pandas as pd -from typing import List, Tuple -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score +from typing import List from mlrun.execution import MLClientCtx from mlrun.datastore import DataItem +from mlrun.mlutils.data import get_sample, get_splits +from mlrun.mlutils.models import gen_sklearn_model, eval_model_v2 from mlrun.utils.helpers import create_class -def get_sample(dataset: DataItem, sample: int, label_column: str) -> Tuple[pd.DataFrame, pd.Series, list]: - """Get a sample of the dataset with labels separated. - - :param dataset: DataItem containing the dataset - :param sample: Number of samples to take. If -1, use all. If < -1, take random sample. - :param label_column: Name of the label column - """ - df = dataset.as_df() - - if sample == -1: - sampled_df = df - elif sample < -1: - sampled_df = df.sample(n=abs(sample), random_state=1) - else: - sampled_df = df.head(sample) - - labels = sampled_df[label_column] - features = sampled_df.drop(label_column, axis=1) - header = list(features.columns) - - return features, labels, header - - -def get_splits( - features: pd.DataFrame, - labels: pd.Series, - num_splits: int, - test_size: float, - val_size: float, - random_state: int = 1 -) -> List[Tuple[pd.DataFrame, pd.Series]]: - """Split data into train, validation, and test sets. 
- - :param features: Feature DataFrame - :param labels: Labels Series - :param num_splits: Number of splits (3 for train/val/test) - :param test_size: Proportion for test set - :param val_size: Proportion of remaining data for validation - :param random_state: Random seed - """ - # First split: separate test set - X_temp, X_test, y_temp, y_test = train_test_split( - features, labels, test_size=test_size, random_state=random_state - ) - - # Second split: separate train and validation from remaining data - X_train, X_val, y_train, y_val = train_test_split( - X_temp, y_temp, test_size=val_size, random_state=random_state - ) - - return [(X_train, y_train), (X_val, y_val), (X_test, y_test)] - - -def gen_sklearn_model(model_pkg_class: str, parameters: list) -> dict: - """Generate sklearn model configuration from class name and parameters. - - :param model_pkg_class: Full class path (e.g., "sklearn.ensemble.RandomForestClassifier") - :param parameters: List of (key, value) parameter tuples - """ - config = { - "META": {"class": model_pkg_class}, - "CLASS": {}, - "FIT": {} - } - - # Parameters that should not be passed to sklearn model - excluded_params = { - 'model_pkg_class', 'dataset', 'label_column', 'encode_cols', - 'sample', 'test_size', 'train_val_split', 'test_set_key', - 'model_evaluator', 'models_dest', 'plots_dest', 'file_ext', - 'model_pkg_file', 'context' - } - - # Separate parameters into model init params and fit params - for key, value in parameters: - if key in ['X', 'y', 'sample_weight']: - config["FIT"][key] = value - elif key not in excluded_params: - # Only add parameters that are not function-specific - config["CLASS"][key] = value - - return config - - -def eval_model_v2( - context: MLClientCtx, - xvalid: pd.DataFrame, - yvalid: pd.Series, - model, - plots_artifact_path: str = None -) -> dict: - """Evaluate a sklearn classifier model. 
- - :param context: MLRun context - :param xvalid: Validation features - :param yvalid: Validation labels - :param model: Trained sklearn model - :param plots_artifact_path: Path for plots (not used in this simplified version) - """ - y_pred = model.predict(xvalid) - - metrics = { - "accuracy": accuracy_score(yvalid, y_pred), - "precision": precision_score(yvalid, y_pred, average='weighted', zero_division=0), - "recall": recall_score(yvalid, y_pred, average='weighted', zero_division=0), - "f1_score": f1_score(yvalid, y_pred, average='weighted', zero_division=0) - } - - # Log metrics to context - for key, value in metrics.items(): - context.log_result(key, value) - - return {} - - def train_model( context: MLClientCtx, model_pkg_class: str, diff --git a/functions/src/sklearn_classifier/test_sklearn_classifier.py b/functions/src/sklearn_classifier/test_sklearn_classifier.py index 78afd623b..5c29e85b3 100644 --- a/functions/src/sklearn_classifier/test_sklearn_classifier.py +++ b/functions/src/sklearn_classifier/test_sklearn_classifier.py @@ -38,29 +38,19 @@ def test_import_sklearn_classifier(): params = {"model_pkg_class": "sklearn.ensemble.RandomForestClassifier", "label_column": "labels"} - # In local mode, artifacts are in function-name/iteration subdirectory - dataset_path = "./artifacts/gen-class-data-gen-class-data/0/classifier-data.csv" - assert os.path.exists(dataset_path), f"Dataset not found at {dataset_path}" - train_run = fn.run(params=params, - inputs={"dataset": dataset_path}, + inputs={"dataset": acquire_run.status.artifacts[0]['spec']['target_path']}, local=True, - artifact_path="./artifacts") - - # Check that the run completed successfully - assert train_run.status.state == "completed", f"Run failed with state: {train_run.status.state}" - - # In local mode, check if model metrics were logged - assert "accuracy" in train_run.status.results or len(train_run.status.results) > 0, \ - "No metrics were logged" + artifact_path="./") - # In local mode, the 
model is saved to artifacts/model/function-name/iteration/model/model.pkl - model_path = "./artifacts/model/sklearn-classifier-train-model/0/model/model.pkl" - assert os.path.exists(model_path), f'Could not find model file at {model_path}' + for artifact in train_run.status.artifacts: + if artifact['kind'] == 'model': + assert os.path.exists(artifact['spec']['target_path']), 'Could not find model dir' + break - # Load the model and verify it can make predictions - model = pickle.load(open(model_path, 'rb')) - df = pd.read_csv(dataset_path) + assert os.path.exists(train_run.status.artifacts[0]['spec']['target_path']) + model = pickle.load(open(artifact['spec']['target_path'] + artifact['spec']['model_file'], 'rb')) + df = pd.read_csv(acquire_run.status.artifacts[0]['spec']['target_path']) x = df.drop(['labels'], axis=1).iloc[0:1] y_true = df['labels'][0] y_pred = model.predict_proba(x).argmax() From 2078e86f530dd0b6338da8afa2b1aeb90f2cba02 Mon Sep 17 00:00:00 2001 From: tomerbv Date: Thu, 12 Feb 2026 11:06:09 +0200 Subject: [PATCH 15/15] added xgboost.XGBRegressor, xgboost.XGBClassifier and lightgbm.LGBMClassifier models to test --- functions/src/auto_trainer/requirements.txt | 1 + functions/src/auto_trainer/test_auto_trainer.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/functions/src/auto_trainer/requirements.txt b/functions/src/auto_trainer/requirements.txt index 262f5e9f2..b23f9b9dd 100644 --- a/functions/src/auto_trainer/requirements.txt +++ b/functions/src/auto_trainer/requirements.txt @@ -1,4 +1,5 @@ pandas scikit-learn~=1.5.2 lightgbm +xgboost<2.0.0 plotly diff --git a/functions/src/auto_trainer/test_auto_trainer.py b/functions/src/auto_trainer/test_auto_trainer.py index fe5c051a4..ac95109f8 100644 --- a/functions/src/auto_trainer/test_auto_trainer.py +++ b/functions/src/auto_trainer/test_auto_trainer.py @@ -28,7 +28,10 @@ MODELS = [ ("sklearn.linear_model.LinearRegression", "regression"), ("sklearn.ensemble.RandomForestClassifier", 
"classification"), + ("xgboost.XGBRegressor", "regression"), + ("xgboost.XGBClassifier", "classification"), ("lightgbm.LGBMRegressor", "regression"), + ("lightgbm.LGBMClassifier", "classification") ] REQUIRED_ENV_VARS = [