diff --git a/site/notebooks.zip b/site/notebooks.zip index 2dbaa50680..bc59d5fe49 100644 Binary files a/site/notebooks.zip and b/site/notebooks.zip differ diff --git a/site/notebooks/EXECUTED/model_development/2-start_development_process.ipynb b/site/notebooks/EXECUTED/model_development/2-start_development_process.ipynb index 82e5ec2041..f3b4b064ed 100644 --- a/site/notebooks/EXECUTED/model_development/2-start_development_process.ipynb +++ b/site/notebooks/EXECUTED/model_development/2-start_development_process.ipynb @@ -811,22 +811,25 @@ "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import GridSearchCV\n", "\n", - "# Logistic Regression grid params\n", "log_reg_params = {\n", - " \"penalty\": [\"l1\", \"l2\"],\n", + " \"l1_ratio\": [0.0, 1.0], # 0 = L2, 1 = L1\n", " \"C\": [0.001, 0.01, 0.1, 1, 10, 100, 1000],\n", - " \"solver\": [\"liblinear\"],\n", "}\n", "\n", - "# Grid search for Logistic Regression\n", - "from sklearn.model_selection import GridSearchCV\n", + "grid_log_reg = GridSearchCV(\n", + " LogisticRegression(\n", + " solver=\"saga\",\n", + " penalty=\"elasticnet\", # required when using l1_ratio\n", + " max_iter=5000,\n", + " ),\n", + " log_reg_params,\n", + ")\n", "\n", - "grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)\n", "grid_log_reg.fit(X_train, y_train)\n", "\n", - "# Logistic Regression best estimator\n", - "log_reg = grid_log_reg.best_estimator_" + "log_reg = grid_log_reg.best_estimator_\n" ] }, { diff --git a/site/notebooks/EXECUTED/test.ipynb b/site/notebooks/EXECUTED/test.ipynb new file mode 100644 index 0000000000..182f2b0222 --- /dev/null +++ b/site/notebooks/EXECUTED/test.ipynb @@ -0,0 +1,19 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9a8bce10", + "metadata": {}, + "source": [ + "# test changed notebook" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/site/tests/model_validation/statsmodels/CumulativePredictionProbabilities.md b/site/tests/model_validation/statsmodels/CumulativePredictionProbabilities.md index b7522d09c0..9e609c8a4d 100644 --- a/site/tests/model_validation/statsmodels/CumulativePredictionProbabilities.md +++ b/site/tests/model_validation/statsmodels/CumulativePredictionProbabilities.md @@ -1,39 +1,35 @@ # CumulativePredictionProbabilities -Visualizes cumulative probabilities of positive and negative classes for both training and testing in classification models. +Visualizes cumulative probabilities of positive and negative classes in classification models. ### Purpose This metric is utilized to evaluate the distribution of predicted probabilities for positive and negative classes in a classification model. It provides a visual assessment of the model's behavior by plotting the cumulative -probabilities for positive and negative classes across both the training and test datasets. +probabilities for positive and negative classes within the provided dataset. ### Test Mechanism -The classification model is evaluated by first computing the predicted probabilities for each instance in both -the training and test datasets, which are then added as a new column in these sets. The cumulative probabilities -for positive and negative classes are subsequently calculated and sorted in ascending order. Cumulative -distributions of these probabilities are created for both positive and negative classes across both training and -test datasets. These cumulative probabilities are represented visually in a plot, containing two subplots - one for -the training data and the other for the test data, with lines representing cumulative distributions of positive and -negative classes. +The classification model is evaluated by first computing the predicted probabilities for each instance in the +dataset, which are then added as a new column. The cumulative probabilities for positive and negative classes are +subsequently calculated and sorted in ascending order. Cumulative distributions of these probabilities are created +for both positive and negative classes. These cumulative probabilities are represented visually in a plot with lines +representing cumulative distributions of positive and negative classes. ### Signs of High Risk - Imbalanced distribution of probabilities for either positive or negative classes. -- Notable discrepancies or significant differences between the cumulative probability distributions for the -training data versus the test data. - Marked discrepancies or large differences between the cumulative probability distributions for positive and negative classes. +- Unusual patterns in the cumulative probability distributions that may indicate model calibration issues. ### Strengths - Provides a visual illustration of data, which enhances the ease of understanding and interpreting the model's behavior. -- Allows for the comparison of model's behavior across training and testing datasets, providing insights about how -well the model is generalized. - Differentiates between positive and negative classes and their respective distribution patterns, aiding in problem diagnosis. +- Helps identify potential calibration issues by visualizing how probabilities are distributed across classes. ### Limitations @@ -42,5 +38,4 @@ problem diagnosis. detection. - The method does not give a solitary quantifiable measure of model risk, instead, it offers a visual representation and broad distributional information. -- If the training and test datasets are not representative of the overall data distribution, the metric could -provide misleading results. \ No newline at end of file +- If the dataset is not representative of the overall data distribution, the metric could provide misleading results. \ No newline at end of file