diff --git a/R/app_server.R b/R/app_server.R index 6b3461a..4915a4b 100644 --- a/R/app_server.R +++ b/R/app_server.R @@ -1,7 +1,7 @@ ########################################################## # RTutor.AI | A Shiny app for chatting with your data. # Author: Xijin Ge | ge@orditus.com -# © 2024 Orditus LLC +# © 2025 Orditus LLC # No warranty & not for commercial use without a license. ########################################################## diff --git a/R/app_ui.R b/R/app_ui.R index 3c92ecd..c4107a2 100644 --- a/R/app_ui.R +++ b/R/app_ui.R @@ -1,7 +1,7 @@ ########################################################## # RTutor.AI | A Shiny app for chatting with your data. # Author: Xijin Ge | ge@orditus.com -# © 2024 Orditus LLC +# © 2025 Orditus LLC # No warranty & not for commercial use without a license. ########################################################## @@ -25,7 +25,7 @@ app_ui <- function(request) { tags$footer( style = "position: fixed;bottom: 0;width: 100%;background-color: #F6FFF5; padding: 10px;text-align: center;z-index: 99;", - span("© 2024 Orditus LLC"), + span("© 2025 Orditus LLC"), HTML(" "), # Adds space actionLink(inputId = "ppolicy", "Privacy Policy"), HTML(" "), diff --git a/R/mod_10_eda.R b/R/mod_10_eda.R index 58499b8..ba76dd6 100644 --- a/R/mod_10_eda.R +++ b/R/mod_10_eda.R @@ -135,6 +135,16 @@ mod_10_eda_ui <- function(id) { ), uiOutput(ns("eda_report_ui")) ) + ), + tabPanel( + title = "Neural Net Reports", + div(style = "margin-left: 20px;", + hr(class = "custom-hr-thick"), + h4(strong("Neural Net Analysis"), + style = "font-size: 24px;" + ), + uiOutput(ns("nnet_report_ui")) + ) ) ) ) @@ -876,5 +886,228 @@ mod_10_eda_serv <- function(id, selected_dataset_name, use_python, } ) + + ##### RTutor NNet Report ##### + # UI for Neural Network Tab + output$nnet_report_ui <- renderUI({ + req(selected_dataset_name() != no_data) + req(!use_python()) + req(!is.null(current_data())) + df <- ggpairs_data() + + # Calculate the number of levels for the selected target variable + # (required info for binary classification) + num_levels <- 0 + target_levels <- character(0) + if (!is.null(input$nnet_target_variable) && input$nnet_target_variable %in% colnames(df)) { + selected_col <- df[[input$nnet_target_variable]] + + if (is.factor(selected_col)) { # if target is a factor + num_levels <- length(levels(selected_col)) # number of levels + if (num_levels == 2) { # if target is a binary factor + target_levels <- levels(selected_col) # vector of target levels + } + } + } + + tagList( + br(), + fluidRow( + column( + width = 3, + actionButton( + inputId = ns("render_nnet_report_rtutor"), + label = strong("Render Report"), + class = "custom-action-button" + ) + ) + ), + br(), + fluidRow( + column( + width = 6, + selectInput( + inputId = ns("nnet_target_variable"), + label = "Select a target variable:", + choices = colnames(df), + selected = input$nnet_target_variable, + multiple = FALSE + ) + ), + if (num_levels == 2) { # if target variable is binary factor + column( + width = 6, + selectInput( + inputId = ns("nnet_positive_class"), + label = "Select the positive class:", + choices = target_levels, + selected = if (length(target_levels) > 0) target_levels[1] else NULL # Default to the first level, or handle empty list + ) + ) + } + ), + br(), + checkboxGroupInput( + inputId = ns("nnet_variables"), + label = "Select up to 20 predictor variables:", + choices = colnames(df), + selected = colnames(df) + ), + br(), + selectInput( + inputId = ns("nnet_size_value"), + label = "Select number of neurons:", + choices = c(2:30), + selected = 6, + multiple = FALSE + ) + ) + }) + + # when user uploads a file and has more than 20 columns, only the first 20 is selected by nnet_variables. + observeEvent(input$user_file, { + req(selected_dataset_name() != no_data) + req(input$select_data == uploaded_data) + req(!use_python()) + req(!is.null(ggpairs_data())) + df <- ggpairs_data() + if(ncol(df) > max_eda_var) { + updateCheckboxGroupInput( + session = session, + inputId = ns("nnet_variables"), + label = "Deselect variables to ignore (optional):", + choices = colnames(df), + selected = colnames(df)[1:max_eda_var] + ) + } + }) + + # if user selects more than 20 columns for the nnet_variables, only the first 20 is selected by nnet_variables. Show a warning. + observeEvent(c(input$nnet_variables, input$nnet_target_variable), { + req(!use_python()) + req(!is.null(ggpairs_data())) + + selected_var <- input$nnet_variables + update_selection <- FALSE + # if the selected target variable is not included in the nnet_variables, add it to the top of the list. + if (!(input$nnet_target_variable %in% selected_var)) { + selected_var <- c(input$nnet_target_variable, selected_var) + update_selection <- TRUE + } + + if(length(selected_var) > max_eda_var) { + selected_var <- selected_var[1:max_eda_var] + + showNotification( + ui = paste("Only the first 20 variables are selected for EDA. + Please deselect some variables to continue."), + id = "nnet_variables_warning", + duration = 5, + type = "error" + ) + update_selection <- TRUE + } + + # if target variable is selected, add to it; if too many, only keep the first 20 + if(update_selection){ + updateCheckboxGroupInput( + session = session, + inputId = ns("nnet_variables"), + label = "Deselect variables to ignore (optional):", + choices = colnames(ggpairs_data()), + selected = selected_var + ) + } + }) + + nnet_file <- reactiveVal(NULL) + + observeEvent(input$render_nnet_report_rtutor, { + req(selected_dataset_name() != no_data) + req(!use_python()) + req(!is.null(current_data())) + + + withProgress(message = "Generating Report (5 minutes)", { + incProgress(0.2) + # Copy the report file to a temporary directory before processing it, in + # case we don't have write permissions to the current working dir (which + # can happen when deployed). + tempReport <- file.path(tempdir(), "RTutor_NNet.Rmd") + # tempReport + tempReport <- gsub("\\", "/", tempReport, fixed = TRUE) + output_file <- gsub("Rmd$", "html", tempReport) + # This should retrieve the project location on your device: + # "C:/Users/bdere/Documents/GitHub/idepGolem" + #wd <- getwd() + + markdown_location <- app_sys("app/www/nnet.Rmd") + file.copy(from = markdown_location, to = tempReport, overwrite = TRUE) + + + # Extract negative class for binary classification + negative_class <- if (!is.null(input$nnet_positive_class)) { + setdiff(levels(ggpairs_data()[[input$nnet_target_variable]]), input$nnet_positive_class) + } else { + NULL + } + + # Set up parameters to pass to Rmd document + params <- list( + df = ggpairs_data()[, input$nnet_variables], + target = input$nnet_target_variable, + predictors = input$nnet_variables, + size = input$nnet_size_value, + positive_class_variable = input$nnet_positive_class, + negative_class_variable = negative_class + ) + req(params) + # Knit the document, passing in the `params` list, and eval it in a + # child of the global environment (this isolates the code in the document + # from the code in this app). + tryCatch({ + rmarkdown::render( + input = tempReport, # markdown_location, + output_file = output_file, + params = params, + envir = new.env(parent = globalenv()) + ) + }, + error = function(e) { + showNotification( + ui = paste("Error when generating the report. Please try again."), + id = "nnet_report_error", + duration = 5, + type = "error" + ) + }, + finally = { + nnet_file(output_file) + # show modal with download button + showModal(modalDialog( + title = "Successfully rendered the report!", + downloadButton( + outputId = ns("nnet_report_rtutor"), + label = "Download" + ), + easyClose = TRUE + )) + } + ) + }) + }) + + # Markdown report + output$nnet_report_rtutor <- downloadHandler( + # For PDF output, change this to "report.pdf" + filename = "RTutor_NNet.html", + content = function(file) { + validate( + need(!is.null(nnet_file()), "File not found.") + ) + file.copy(from = nnet_file(), to = file, overwrite = TRUE) + } + ) + }) } \ No newline at end of file diff --git a/R/mod_12_about.R b/R/mod_12_about.R index dea3c7b..b7db99a 100644 --- a/R/mod_12_about.R +++ b/R/mod_12_about.R @@ -104,8 +104,8 @@ mod_12_about_ui <- function(id) { ), p( "Explore our other AI tools at ", - a("Chatlize.ai", - href = "https://chatlize.ai/", + a("Datably.ai", + href = "https://datably.ai/", target = "_blank" ), style = "font-size: 23px;padding-left: 20px;padding-right: 20px;" diff --git a/R/mod_14_first_time_user_ui.r b/R/mod_14_first_time_user_ui.r index d5f68c5..2930490 100644 --- a/R/mod_14_first_time_user_ui.r +++ b/R/mod_14_first_time_user_ui.r @@ -43,8 +43,8 @@ mod_14_first_time_user_ui <- function(id) { ), h4("Also try ", a( - "Chatlize.ai,", - href = "https://chatlize.ai", + "Datably.ai,", + href = "https://datably.ai", target = "_blank" ), " a more general platform for analyzing data through chats. @@ -97,7 +97,7 @@ mod_14_first_time_user_ui <- function(id) { tags$li( "RTutor can only analyze traditional statistics data, where rows are observations and columns are variables. For complex - data, try https://chatlize.ai." + data, try https://datably.ai." ), tags$li( "Once uploaded, your data is automatically loaded into R as a diff --git a/inst/app/www/nnet.Rmd b/inst/app/www/nnet.Rmd new file mode 100644 index 0000000..0659267 --- /dev/null +++ b/inst/app/www/nnet.Rmd @@ -0,0 +1,450 @@ +--- +title: "Neural net analysis, automated" +author: "Generated by RTutor.ai" +date: "`r Sys.Date()`" +output: + html_document: + number_sections: true + code_folding: hide + toc: true +params: + df: NULL + target: NULL + predictors: NULL + size: 5 + decay: 0.1 + positive_class_variable: NULL + negative_class_variable: NULL + cv_folds: 5 + date: + label: "Date: " + value: !r Sys.Date() + printcode: + label: "Display Code" + value: TRUE + input: checkbox +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = params$printcode, message = FALSE, warning = FALSE) + +suppressPackageStartupMessages({ + library(ggplot2) + library(dplyr) + library(tidyr) + library(nnet) + # caret/pROC are optional; we check availability later +}) + +# Pull params +df <- params$df +target <- params$target +predictors <- params$predictors +size <- as.numeric(params$size) +decay <- as.numeric(params$decay) +pos_lab <- params$positive_class_variable +neg_lab <- params$negative_class_variable +cv_folds <- as.integer(params$cv_folds) + +stopifnot(!is.null(df), !is.null(target), !is.null(predictors)) +predictors <- setdiff(predictors, target) + +set.seed(123) +``` + +# Parameter Glossary (and values used) + +Below are the parameters the report accepts and what they are. + +- **`df`** *(data.frame)*: Your input dataset (required). +- **`target`** *(string)*: Name of the outcome/response column in `df` (required). +- **`predictors`** *(character vector)*: Names of input features (required). +- **`size`** *(integer; default = `r params$size`)*: Number of hidden units in the single hidden layer. Larger values increase flexibility but can overfit. +- **`decay`** *(numeric; default = `r params$decay`)*: L2 weight penalty that shrinks weights and helps prevent overfitting. +- **`positive_class_variable`** *(string; optional)*: For **binary classification**, which class label should be treated as “positive” for ROC/AUC and thresholding. +- **`negative_class_variable`** *(string; optional)*: The other class in binary classification. Only used when explicit labeling is needed. +- **`cv_folds`** *(integer; default = `r params$cv_folds`)*: Number of cross-validation folds (if `caret` is installed). Reduces variance of metric estimates. + +```{r show-params, echo=FALSE} +param_tbl <- tibble::tibble( + parameter = c("df", "target", "predictors", "size", "decay", "positive_class_variable", "negative_class_variable", "cv_folds"), + value = c( + paste0("data.frame with ", nrow(df), " rows × ", ncol(df), " cols"), + as.character(target), + paste(predictors, collapse = ", "), + as.character(size), + as.character(decay), + ifelse(is.null(pos_lab), "NULL", as.character(pos_lab)), + ifelse(is.null(neg_lab), "NULL", as.character(neg_lab)), + as.character(cv_folds) + ) +) +knitr::kable(param_tbl, caption = "Parameters in effect for this run") +``` + +# Data Summary +We will complete a neural network analysis using the nnet package. We will evaluate models for +both classification and regression tasks, depending on the nature of the target variable. + +```{r data-summary} +cat("Head of dataframe:") +head(df) # Display the first few rows of the dataframe +cat("Summary of dataframe:") +summary(df) # Summary statistics +``` + + +# Data Pre-process + +We apply a few generic rules for pre-processing: + +1. **Remove likely ID columns**: character/factor columns where unique values > 80% of rows. +2. **Coerce low-cardinality numerics to factor**: numeric with <5% unique and <13 unique values. +3. **Center/scale numerics prior to fitting** (required for `nnet` stability). + +```{r data-prep} +# Record dataset shape before preprocessing +rows_before <- nrow(df) +cols_before <- ncol(df) + +Xy <- df[, unique(c(target, predictors)), drop = FALSE] + +# Track changes +changes <- list() + +# 1) Drop likely IDs (very high cardinality categorical) +is_high_card <- function(x) is.character(x) || is.factor(x) +drop_cols <- names(Xy)[sapply(Xy, function(x) is_high_card(x) && (length(unique(x)) > 0.8*nrow(Xy)))] +if (length(drop_cols)) { + changes <- c(changes, paste0("Dropped high-cardinality non-numeric columns: ", paste(drop_cols, collapse = ", "))) + Xy <- dplyr::select(Xy, -all_of(drop_cols)) + predictors <- setdiff(predictors, drop_cols) +} + +# 2) Coerce low-unique numeric codes to factor +for (nm in names(Xy)) { + x <- Xy[[nm]] + if (is.numeric(x)) { + uniq_ratio <- length(unique(x))/nrow(Xy) + if (uniq_ratio < 0.05 && length(unique(x)) < 13) { + Xy[[nm]] <- factor(x) + changes <- c(changes, paste0("Converted numeric to factor: ", nm)) + } + } +} + +# 3) Target type detection +y <- Xy[[target]] +target_is_classification <- is.factor(y) || is.character(y) +if (is.character(y)) { + Xy[[target]] <- factor(y) + y <- Xy[[target]] + changes <- c(changes, paste0("Converted target variable '", target, "' from character to factor.")) +} + +# 4) Scaling check (numeric predictors) +num_preds <- predictors[sapply(Xy[, predictors, drop = FALSE], is.numeric)] +if (length(num_preds)) { + changes <- c(changes, paste0("Numeric predictors will be centered and scaled: ", paste(num_preds, collapse = ", "))) +} + +# Print summary of preprocessing changes +cat("Data dimensions before preprocessing: ", rows_before, " rows × ", cols_before, " columns\n") +cat("Data dimensions after preprocessing: ", nrow(Xy), " rows × ", ncol(Xy), " columns\n\n") + +if (length(changes) == 0) { + cat("No preprocessing changes were applied.\n") +} else { + cat("Preprocessing changes applied:\n") + for (c in changes) cat(" -", c, "\n") +} +``` + +# Train / Test Split (70/30) for Clear Holdout Evaluation + +```{r split} +if (target_is_classification) { + # Require caret for stratified partition; otherwise do simple split + if (requireNamespace("caret", quietly = TRUE)) { + split <- caret::createDataPartition(Xy[[target]], p = 0.7, list = FALSE) + } else { + split <- sample.int(nrow(Xy), size = floor(0.7*nrow(Xy))) + } +} else { + split <- sample.int(nrow(Xy), size = floor(0.7*nrow(Xy))) +} + +train_df <- Xy[split, , drop = FALSE] +test_df <- Xy[-split, , drop = FALSE] + +# Center/scale numeric predictors +scale_cols <- setdiff(names(train_df), target) +num_cols <- scale_cols[sapply(train_df[, scale_cols, drop = FALSE], is.numeric)] +pp <- NULL +if (length(num_cols)) { + mu <- sapply(train_df[, num_cols, drop = FALSE], mean, na.rm = TRUE) + sdv <- sapply(train_df[, num_cols, drop = FALSE], sd, na.rm = TRUE) + sdv[sdv == 0] <- 1 + train_df[, num_cols] <- sweep(train_df[, num_cols, drop = FALSE], 2, mu, "-") + train_df[, num_cols] <- sweep(train_df[, num_cols, drop = FALSE], 2, sdv, "/") + test_df[, num_cols] <- sweep(test_df[, num_cols, drop = FALSE], 2, mu, "-") + test_df[, num_cols] <- sweep(test_df[, num_cols, drop = FALSE], 2, sdv, "/") + pp <- list(mu = mu, sd = sdv) +} +``` + +# Model Formula + +```{r formula} +# Rebuild predictors in case any were dropped +predictors <- intersect(predictors, names(train_df)) +form <- as.formula(paste(target, "~", paste(predictors, collapse = " + "))) +form +``` + +# Neural Network for **Classification** *(if target is categorical)* + +```{r nnet-classification} +if (target_is_classification) { + train_df[[target]] <- factor(train_df[[target]]) + test_df[[target]] <- factor(test_df[[target]], levels = levels(train_df[[target]])) + + # For binary classification, set explicit positive/negative if provided + if (nlevels(train_df[[target]]) == 2 && !is.null(pos_lab) && (pos_lab %in% levels(train_df[[target]]))) { + train_df[[target]] <- stats::relevel(train_df[[target]], ref = pos_lab) + test_df[[target]] <- factor(test_df[[target]], levels = levels(train_df[[target]])) + } + + cat("Fitting nnet classification model...\n") + nn_model <- nnet::nnet(formula = form, data = train_df, size = size, decay = decay, linout = FALSE, trace = FALSE, MaxNWts = 10000) + + # ===== RAW DUMPS (kept) ===== + cat("\n--- RAW MODEL DUMP ---\n") + print(nn_model) + cat("\n--- SUMMARY(nn_model) ---\n") + print(summary(nn_model)) + cat("\n--- WEIGHTS (nn_model$wts) ---\n") + print(utils::head(nn_model$wts, 50)) # show first 50 for readability + if (length(nn_model$wts) > 50) cat("... (truncated) ...\n") + + # Predictions + if (nlevels(train_df[[target]]) == 2) { + # Binary: nnet type='raw' returns probability of first class level + probs <- predict(nn_model, newdata = test_df, type = "raw") + probs <- as.numeric(probs) + pos_level <- levels(train_df[[target]])[1] + pred_class <- ifelse(probs >= 0.5, pos_level, setdiff(levels(train_df[[target]]), pos_level)) + pred_class <- factor(pred_class, levels = levels(train_df[[target]])) + + # Confusion Matrix + metrics + if (requireNamespace("caret", quietly = TRUE)) { + cm <- caret::confusionMatrix(pred_class, test_df[[target]], positive = pos_level) + cat("\nHoldout Accuracy:", round(cm$overall["Accuracy"], 4), + " | Kappa:", round(cm$overall["Kappa"], 4), "\n") + cat("Precision:", round(cm$byClass["Pos Pred Value"], 4), + " | Recall (Sensitivity):", round(cm$byClass["Sensitivity"], 4), + " | F1:", round(cm$byClass["F1"], 4), "\n") + cm_tbl <- cm$table + } else { + cm_tbl <- table(Prediction = pred_class, Reference = test_df[[target]]) + acc <- sum(diag(cm_tbl)) / sum(cm_tbl) + cat("\nHoldout Accuracy:", round(acc, 4), "(caret not installed; basic accuracy only)\n") + } + + # ROC/AUC + if (requireNamespace("pROC", quietly = TRUE)) { + roc_obj <- pROC::roc(response = test_df[[target]], predictor = probs, quiet = TRUE) + cat("AUC:", round(as.numeric(pROC::auc(roc_obj)), 4), "\n") + plot(roc_obj, main = paste0("ROC Curve (AUC = ", round(as.numeric(pROC::auc(roc_obj)), 3), ")")) + } + + # Confusion matrix heatmap + cm_df <- as.data.frame(cm_tbl) + ggplot(cm_df, aes(x = Prediction, y = Reference, fill = Freq)) + + geom_tile() + geom_text(aes(label = Freq)) + + labs(title = "Confusion Matrix (Holdout)", x = "Predicted", y = "Actual") + + scale_fill_continuous(type = "viridis") + + theme_minimal() + + } else { + # Multiclass + prob_mat <- predict(nn_model, newdata = test_df, type = "raw") + if (is.null(dim(prob_mat))) prob_mat <- cbind(prob_mat) + pred_class <- factor(colnames(prob_mat)[max.col(prob_mat, ties.method = "first")], + levels = levels(train_df[[target]])) + + if (requireNamespace("caret", quietly = TRUE)) { + cm <- caret::confusionMatrix(pred_class, test_df[[target]]) + cat("\nHoldout Accuracy:", round(cm$overall["Accuracy"], 4), + " | Kappa:", round(cm$overall["Kappa"], 4), "\n") + cm_tbl <- cm$table + } else { + cm_tbl <- table(Prediction = pred_class, Reference = test_df[[target]]) + acc <- sum(diag(cm_tbl)) / sum(cm_tbl) + cat("\nHoldout Accuracy:", round(acc, 4), "(caret not installed; basic accuracy only)\n") + } + + # Multiclass AUC (Hand–Till) if available + if (requireNamespace("pROC", quietly = TRUE)) { + all_lvls <- levels(train_df[[target]]) + prob_df <- as.data.frame(prob_mat) + if (!all(all_lvls %in% colnames(prob_df))) { + missing <- setdiff(all_lvls, colnames(prob_df)) + for (m in missing) prob_df[[m]] <- 0 + } + prob_df <- prob_df[, all_lvls, drop = FALSE] + auc_mc <- try(pROC::multiclass.roc(response = test_df[[target]], predictor = as.matrix(prob_df)), silent = TRUE) + if (!inherits(auc_mc, "try-error")) { + cat("Multiclass AUC (Hand–Till):", round(as.numeric(auc_mc$auc), 4), "\n") + } + } + + # Confusion matrix heatmap + cm_df <- as.data.frame(cm_tbl) + ggplot(cm_df, aes(x = Prediction, y = Reference, fill = Freq)) + + geom_tile() + geom_text(aes(label = Freq)) + + labs(title = "Confusion Matrix (Holdout)", x = "Predicted", y = "Actual") + + scale_fill_continuous(type = "viridis") + + theme_minimal() + } +} else { + cat("Target is not categorical; skipping classification branch.\n") +} +``` + +# Neural Network for **Regression** *(if target is numeric)* + +```{r nnet-regression} +if (!target_is_classification) { + train_df[[target]] <- as.numeric(train_df[[target]]) + test_df[[target]] <- as.numeric(test_df[[target]]) + + cat("Fitting nnet regression model...\n") + nn_model_reg <- nnet::nnet(formula = form, data = train_df, size = size, decay = decay, + linout = TRUE, maxit = 1000, trace = FALSE, MaxNWts = 10000) + + # ===== RAW DUMPS (kept) ===== + cat("\n--- RAW MODEL DUMP (REGRESSION) ---\n") + print(nn_model_reg) + cat("\n--- SUMMARY(nn_model_reg) ---\n") + print(summary(nn_model_reg)) + cat("\n--- WEIGHTS (nn_model_reg$wts) ---\n") + print(utils::head(nn_model_reg$wts, 50)) + if (length(nn_model_reg$wts) > 50) cat("... (truncated) ...\n") + + # Predictions & holdout metrics + preds <- as.numeric(predict(nn_model_reg, newdata = test_df)) + actual <- test_df[[target]] + rmse <- sqrt(mean((preds - actual)^2, na.rm = TRUE)) + mae <- mean(abs(preds - actual), na.rm = TRUE) + r2 <- cor(preds, actual, use = "complete.obs")^2 + + cat("\nHoldout metrics:\n") + cat("RMSE:", round(rmse, 4), " | MAE:", round(mae, 4), " | R^2:", round(r2, 4), "\n") + + # Predicted vs Actual plot + plot_df <- data.frame(Actual = actual, Predicted = preds) + ggplot(plot_df, aes(x = Actual, y = Predicted)) + + geom_point(alpha = 0.7) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + labs(title = "Predicted vs Actual (Holdout)", x = "Actual", y = "Predicted") + + theme_minimal() +} else { + cat("Target is not numeric; skipping regression branch.\n") +} +``` + +# Cross-Validated Summary Metrics (Optional) + +If the **`caret`** package is available, we’ll also report **k-fold cross-validated** performance (classification: Accuracy/Kappa and ROC/AUC for binary; regression: RMSE/MAE/R²). If `caret` is not installed, this section will be skipped without failing the report. + +```{r cv-metrics} +if (!requireNamespace("caret", quietly = TRUE)) { + cat("Package 'caret' not installed; skipping cross-validated metrics.\n") +} else { + try({ + # Drop rows with any missing predictor/target values for CV + Xy_cv <- Xy[, unique(c(target, predictors)), drop = FALSE] + Xy_cv <- Xy_cv[stats::complete.cases(Xy_cv), , drop = FALSE] + + # Ensure we still have predictors + preds_cv <- intersect(predictors, names(Xy_cv)) + if (length(preds_cv) == 0L) stop("No valid predictors available for CV after preprocessing.") + + # Outcome typing + level sanitization so caret/nnet are happy + y_cv <- Xy_cv[[target]] + is_class <- is.factor(y_cv) || is.character(y_cv) + if (is.character(y_cv)) Xy_cv[[target]] <- factor(y_cv) + + if (is_class) { + Xy_cv[[target]] <- droplevels(factor(Xy_cv[[target]])) + + # Sanitize class level names to valid R identifiers (prevents caret errors) + orig_lvls_cv <- levels(Xy_cv[[target]]) + sane_lvls_cv <- make.names(orig_lvls_cv, unique = TRUE) + if (!identical(orig_lvls_cv, sane_lvls_cv)) { + levels(Xy_cv[[target]]) <- sane_lvls_cv + cat("Sanitized class levels for CV: ", + paste(paste0(orig_lvls_cv, "→", sane_lvls_cv), collapse = ", "), "\n") + } + + if (nlevels(Xy_cv[[target]]) < 2) stop("Target has fewer than 2 levels for classification CV.") + + # Relevel positive class if provided (sanitize the label too) + if (nlevels(Xy_cv[[target]]) == 2) { + pos_lab_cv <- pos_lab + if (!is.null(pos_lab_cv)) pos_lab_cv <- make.names(pos_lab_cv, unique = TRUE) + + if (!is.null(pos_lab_cv) && (pos_lab_cv %in% levels(Xy_cv[[target]]))) { + Xy_cv[[target]] <- stats::relevel(Xy_cv[[target]], ref = pos_lab_cv) + cat("CV positive class set to:", pos_lab_cv, "\n") + } else { + message("Binary classification (CV): positive class set to first level: ", + levels(Xy_cv[[target]])[1]) + } + } + } + + # Folds guard + k <- max(2L, min(cv_folds, nrow(Xy_cv) - 1L)) + + ctrl <- caret::trainControl( + method = "cv", + number = k, + classProbs = is_class, + summaryFunction = if (is_class && nlevels(Xy_cv[[target]]) == 2) + caret::twoClassSummary else caret::defaultSummary, + savePredictions = "final", + allowParallel = FALSE + ) + + form_cv <- stats::as.formula(paste(target, "~", paste(preds_cv, collapse = " + "))) + metric <- if (is_class && nlevels(Xy_cv[[target]]) == 2) "ROC" else if (is_class) "Accuracy" else "RMSE" + + fit_cv <- caret::train( + form_cv, data = Xy_cv, + method = "nnet", + trControl = ctrl, + tuneGrid = data.frame(size = size, decay = decay), + preProcess = c("center", "scale"), + linout = !is_class, + trace = FALSE, + MaxNWts = 10000 + ) + + cat("\nCross-validated results (caret):\n") + print(fit_cv$results) + + if (metric == "ROC" && "ROC" %in% names(fit_cv$results)) { + cat("Mean ROC (CV):", round(max(fit_cv$results$ROC), 4), "\n") + } + }, silent = FALSE) +} +``` + +# Notes & Next Steps + +- **Tuning**: Increase `size` or adjust `decay` to trade off bias/variance. For automated tuning, expand to a grid (multiple `size`/`decay` combos) and pick the best by CV. +- **Class imbalance**: For skewed classification problems, consider stratified sampling and threshold tuning beyond 0.5 (Youden’s J, cost-sensitive metrics). +- **Stability**: For more stable metrics, increase `cv_folds`. +- **Reproducibility**: Keep the `set.seed()` and document your parameters (see **Parameter Glossary** above).