From ce5b4370f79e61119b891999d27104e2c44c4233 Mon Sep 17 00:00:00 2001
From: Juan
Date: Wed, 10 Dec 2025 21:38:58 +0100
Subject: [PATCH] Remove references to test and training datasets in docstring

---
 .../CumulativePredictionProbabilities.py      | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py b/validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py
index a87481d98..27e251c48 100644
--- a/validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py
+++ b/validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py
@@ -18,40 +18,36 @@ def CumulativePredictionProbabilities(
     dataset: VMDataset, model: VMModel, title: str = "Cumulative Probabilities"
 ) -> Tuple[go.Figure, RawData]:
     """
-    Visualizes cumulative probabilities of positive and negative classes for both training and testing in classification models.
+    Visualizes cumulative probabilities of positive and negative classes in classification models.
 
     ### Purpose
 
     This metric is utilized to evaluate the distribution of predicted probabilities for positive and negative classes
     in a classification model. It provides a visual assessment of the model's behavior by plotting the cumulative
-    probabilities for positive and negative classes across both the training and test datasets.
+    probabilities for positive and negative classes within the provided dataset.
 
     ### Test Mechanism
 
-    The classification model is evaluated by first computing the predicted probabilities for each instance in both
-    the training and test datasets, which are then added as a new column in these sets. The cumulative probabilities
-    for positive and negative classes are subsequently calculated and sorted in ascending order. Cumulative
-    distributions of these probabilities are created for both positive and negative classes across both training and
-    test datasets. These cumulative probabilities are represented visually in a plot, containing two subplots - one for
-    the training data and the other for the test data, with lines representing cumulative distributions of positive and
-    negative classes.
+    The classification model is evaluated by first computing the predicted probabilities for each instance in the
+    dataset, which are then added as a new column. The cumulative probabilities for positive and negative classes are
+    subsequently calculated and sorted in ascending order. Cumulative distributions of these probabilities are created
+    for both positive and negative classes. These cumulative probabilities are represented visually in a plot with lines
+    representing cumulative distributions of positive and negative classes.
 
     ### Signs of High Risk
 
     - Imbalanced distribution of probabilities for either positive or negative classes.
-    - Notable discrepancies or significant differences between the cumulative probability distributions for the
-    training data versus the test data.
     - Marked discrepancies or large differences between the cumulative probability distributions for positive and
     negative classes.
+    - Unusual patterns in the cumulative probability distributions that may indicate model calibration issues.
 
     ### Strengths
 
     - Provides a visual illustration of data, which enhances the ease of understanding and interpreting the model's
     behavior.
-    - Allows for the comparison of model's behavior across training and testing datasets, providing insights about how
-    well the model is generalized.
     - Differentiates between positive and negative classes and their respective distribution patterns, aiding in
     problem diagnosis.
+    - Helps identify potential calibration issues by visualizing how probabilities are distributed across classes.
 
     ### Limitations
 
@@ -60,8 +56,7 @@ def CumulativePredictionProbabilities(
     detection.
     - The method does not give a solitary quantifiable measure of model risk, instead, it offers a visual representation
     and broad distributional information.
-    - If the training and test datasets are not representative of the overall data distribution, the metric could
-    provide misleading results.
+    - If the dataset is not representative of the overall data distribution, the metric could provide misleading results.
     """
     df = dataset.df
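As a companion to the Test Mechanism wording touched by this patch, below is a minimal, self-contained sketch of how such a cumulative-probability plot can be built with pandas, NumPy, and Plotly. It is not ValidMind's implementation: the function name and the "target"/"probabilities" column names are illustrative assumptions, and the real test receives a VMDataset/VMModel pair and returns a RawData object alongside the figure.

```python
# Sketch only (assumed column names, not ValidMind internals): plot the cumulative
# distribution of predicted probabilities for the positive and negative classes
# of a single dataset, as described in the updated docstring.
import numpy as np
import pandas as pd
import plotly.graph_objects as go


def cumulative_prediction_probabilities_sketch(
    df: pd.DataFrame,
    target_column: str = "target",
    prob_column: str = "probabilities",
    title: str = "Cumulative Probabilities",
) -> go.Figure:
    fig = go.Figure()
    for class_value, label in [(0, "Negative"), (1, "Positive")]:
        # Predicted probabilities for this class, sorted in ascending order
        probs = df.loc[df[target_column] == class_value, prob_column].sort_values().to_numpy()
        # Empirical cumulative distribution: fraction of rows at or below each probability
        cumulative = np.arange(1, len(probs) + 1) / len(probs)
        fig.add_trace(go.Scatter(x=probs, y=cumulative, mode="lines", name=label))
    fig.update_layout(title=title, xaxis_title="Predicted probability", yaxis_title="Cumulative proportion")
    return fig


# Example usage with a toy frame; in the library the probabilities would come from the model.
# toy = pd.DataFrame({"target": [0, 1, 1, 0, 1], "probabilities": [0.2, 0.9, 0.7, 0.4, 0.6]})
# cumulative_prediction_probabilities_sketch(toy).show()
```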