diff --git a/DESCRIPTION b/DESCRIPTION
index 94dc35c..35e75d3 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -55,4 +55,4 @@ LazyData: yes
 LazyDataCompression: xz
 LazyLoad: yes
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
index da53ada..7c64803 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,10 +7,8 @@ export(get_census_data)
 export(predict_race)
 import(PL94171)
 importFrom(Rcpp,evalCpp)
-importFrom(dplyr,coalesce)
 importFrom(dplyr,pull)
 importFrom(furrr,future_map_dfr)
-importFrom(piggyback,pb_download)
 importFrom(purrr,map_dfr)
 importFrom(rlang,"%||%")
 importFrom(stats,rmultinom)
diff --git a/R/merge_names.R b/R/merge_names.R
index c1e3d8e..4f5d7f2 100644
--- a/R/merge_names.R
+++ b/R/merge_names.R
@@ -1,102 +1,149 @@
+# ------------------------------
+# Helper: resolve cache location
+# ------------------------------
+# Resolves the directory used to cache the wru name dictionaries:
+#   * options(wru_data_wd = TRUE)    -> current working directory
+#   * options(wru_data_wd = "path")  -> that directory (created if needed)
+#   * anything else                  -> tempdir() for this session
+wru_cache_dir <- function() {
+  opt <- getOption("wru_data_wd", default = FALSE)
+  if (isTRUE(opt)) {
+    # TRUE means: use current working directory
+    return(getwd())
+  }
+  if (is.character(opt) && length(opt) == 1 && !is.na(opt) && nzchar(opt)) {
+    dir.create(path.expand(opt), recursive = TRUE, showWarnings = FALSE)
+    return(path.expand(opt))
+  }
+  # default: tempdir for this session
+  tempdir()
+}
+
+#' Preflight for name data (persistent, pinned, and verified)
+#' Downloads 4 name dictionaries to a persistent directory if missing.
+#' The destination honours options(wru_data_wd = ...) exactly as
+#' wru_cache_dir() does (TRUE -> getwd(); a path -> that path; else tempdir()).
+wru_data_preflight <- function() {
+  # BUGFIX: resolve the destination via wru_cache_dir() so downloads land in
+  # the same directory merge_names() later reads from. The previous
+  # ifelse(getOption("wru_data_wd", ...), getwd(), tempdir()) errored when the
+  # option was a character path and ignored path-valued settings entirely.
+  dest <- wru_cache_dir()
+  files_needed <- c(
+    "wru-data-first_c.rds",
+    "wru-data-mid_c.rds",
+    "wru-data-census_last_c.rds",
+    "wru-data-last_c.rds"
+  )
+  missing <- files_needed[!file.exists(file.path(dest, files_needed))]
+  if (length(missing) == 0L) return(invisible(TRUE))
+
+  # Use the release that actually has these assets (matches CRAN code).
+  tag_to_use <- "v2.0.0"
+
+  # Try download; piggyback ignores .token when "" (same as CRAN code).
+  tryCatch(
+    {
+      piggyback::pb_download(
+        file = missing,
+        repo = "kosukeimai/wru",
+        tag = tag_to_use,
+        dest = dest,
+        overwrite = TRUE,
+        .token = ""
+      )
+    },
+    error = function(e) {
+      stop(
+        sprintf(
+          "wru_data_preflight() failed to fetch: %s\nReason: %s",
+          paste(missing, collapse = ", "),
+          conditionMessage(e)
+        ),
+        call. = FALSE
+      )
+    }
+  )
+
+  # Verify they arrived; if not, guide the user.
+  still_missing <- files_needed[!file.exists(file.path(dest, files_needed))]
+  if (length(still_missing) > 0L) {
+    stop(
+      paste0(
+        "wru_data_preflight(): some files are still missing after download: ",
+        paste(still_missing, collapse = ", "),
+        "\nCheck network/GitHub access or download these assets from the ",
+        "wru v2.0.0 release and place them in: ", normalizePath(dest),
+        "\nThen re-run."
+      ),
+      call. = FALSE
+    )
+  }
+  invisible(TRUE)
+}
+
 #' Surname probability merging function.
 #'
 #' \code{merge_names} merges names in a user-input dataset with corresponding
-#' race/ethnicity probabilities derived from both the U.S. Census Surname List
-#' and Spanish Surname List and voter files from states in the Southern U.S.
-#'
-#' This function allows users to match names in their dataset with database entries
-#' estimating P(name | ethnicity) for each of the five major racial groups for each
-#' name. The database probabilities are derived from both the U.S. Census Surname List
-#' and Spanish Surname List and voter files from states in the Southern U.S.
-#' -#' By default, the function matches names as follows: -#' 1) Search raw surnames in the database; -#' 2) Remove any punctuation and search again; -#' 3) Remove any spaces and search again; -#' 4) Remove suffixes (e.g., "Jr") and search again (last names only) -#' 5) Split double-barreled names into two parts and search first part of name; -#' 6) Split double-barreled names into two parts and search second part of name; -#' -#' Each step only applies to names not matched in a previous step. -#' Steps 2 through 6 are not applied if \code{clean.surname} is FALSE. -#' -#' Note: Any name appearing only on the Spanish Surname List is assigned a -#' probability of 1 for Hispanics/Latinos and 0 for all other racial groups. +#' race/ethnicity probabilities derived from both the U.S. Census Surname List +#' and Spanish Surname List and voter files from states in the Southern U.S. #' -#' @param voter.file An object of class \code{data.frame}. Must contain a row for each individual being predicted, -#' as well as a field named \code{\var{last}} containing each individual's surname. -#' If first name is also being used for prediction, the file must also contain a field -#' named \code{\var{first}}. If middle name is also being used for prediction, the field -#' must also contain a field named \code{\var{middle}}. -#' @param namesToUse A character vector identifying which names to use for the prediction. -#' The default value is \code{"last"}, indicating that only the last name will be used. -#' Other options are \code{"last, first"}, indicating that both last and first names will be -#' used, and \code{"last, first, middle"}, indicating that last, first, and middle names will all -#' be used. -#' @param census.surname A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -#' function will call \code{merge_surnames} to merge in Pr(Race | Surname) -#' from U.S. Census Surname List (2000, 2010, or 2020) and Spanish Surname List. 
-#' If \code{FALSE}, user must provide a \code{name.dictionary} (see below). -#' Default is \code{TRUE}. -#' @param table.surnames An object of class \code{data.frame} provided by the -#' users as an alternative surname dictionary. It will consist of a list of -#' U.S. surnames, along with the associated probabilities P(name | ethnicity) -#' for ethnicities: white, Black, Hispanic, Asian, and other. Default is \code{NULL}. -#' (\code{\var{last_name}} for U.S. surnames, \code{\var{p_whi_last}} for White, -#' \code{\var{p_bla_last}} for Black, \code{\var{p_his_last}} for Hispanic, -#' \code{\var{p_asi_last}} for Asian, \code{\var{p_oth_last}} for other). -#' @param table.first See \code{\var{table.surnames}}. -#' @param table.middle See \code{\var{table.surnames}}. -#' @param impute.missing See \code{predict_race}. -#' @param model See \code{predict_race}. -#' @param clean.names A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -#' any surnames in \code{\var{voter.file}} that cannot initially be matched -#' to the database will be cleaned, according to U.S. Census specifications, -#' in order to increase the chance of finding a match. Default is \code{TRUE}. -#' @return Output will be an object of class \code{data.frame}. It will -#' consist of the original user-input data with additional columns that -#' specify the part of the name matched with Census data (\code{\var{surname.match}}), -#' and the probabilities Pr(Race | Surname) for each racial group -#' (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, -#' \code{\var{p_his}} for Hispanic/Latino, -#' \code{\var{p_asi}} for Asian and Pacific Islander, and -#' \code{\var{p_oth}} for Other/Mixed). 
-#' @importFrom dplyr coalesce -#' @examples -#' data(voters) -#' \dontrun{try(merge_names(voters, namesToUse = "surname", census.surname = TRUE))} +#' @param voter.file data.frame with at least a \code{surname} column; \code{first} and \code{middle} if used +#' @param namesToUse "surname", "surname, first", or "surname, first, middle" +#' @param census.surname logical; if TRUE, use Census/Spanish surname list +#' @param table.surnames Optional custom last-name dictionary (schema like shipped RDS) +#' @param table.first Optional custom first-name dictionary +#' @param table.middle Optional custom middle-name dictionary +#' @param clean.names logical; try cleaning/fallback passes +#' @param impute.missing logical; impute c_* NA with column means (else 1) +#' @param model kept for API parity +#' @param return.unmatched logical; add *_matched boolean columns +#' @return data.frame with probabilities (and flags if requested) #' @keywords internal -merge_names <- function(voter.file, namesToUse, census.surname, table.surnames = NULL, table.first = NULL, table.middle = NULL, clean.names = TRUE, impute.missing = FALSE, model = "BISG") { - - # check the names +merge_names <- function( + voter.file, + namesToUse, + census.surname, + table.surnames = NULL, + table.first = NULL, + table.middle = NULL, + clean.names = TRUE, + return.unmatched = TRUE, + impute.missing = FALSE, + model = "BISG" +) { + # --- arg checks ------------------------------------------------------------ if (namesToUse == "surname") { if (!("surname" %in% names(voter.file))) { stop("Voter data frame needs to have a column named 'surname'.") } } else if (namesToUse == "surname, first") { - if (!("surname" %in% names(voter.file)) || !("first" %in% names(voter.file))) { - stop("Voter data frame needs to have a column named 'surname' and a column called 'first'.") + if (!all(c("surname","first") %in% names(voter.file))) { + stop("Voter data frame needs 'surname' and 'first'.") } } else if (namesToUse == 
"surname, first, middle") { - if (!("surname" %in% names(voter.file)) || !("first" %in% names(voter.file)) || - !("middle" %in% names(voter.file))) { - stop("Voter data frame needs to have a column named 'surname', a column called 'first', and a column called 'middle'.") + if (!all(c("surname","first","middle") %in% names(voter.file))) { + stop("Voter data frame needs 'surname', 'first', and 'middle'.") } } - + + # --- ensure data present (downloads only if missing) ----------------------- wru_data_preflight() - - path <- ifelse(getOption("wru_data_wd", default = FALSE), getwd(), tempdir()) - - first_c <- readRDS(paste0(path, "/wru-data-first_c.rds")) - mid_c <- readRDS(paste0(path, "/wru-data-mid_c.rds")) - if(census.surname){ - last_c <- readRDS(paste0(path, "/wru-data-census_last_c.rds")) - } else { - last_c <- readRDS(paste0(path, "/wru-data-last_c.rds")) + + path <- wru_cache_dir() + first_c <- readRDS(file.path(path, "wru-data-first_c.rds")) + mid_c <- readRDS(file.path(path, "wru-data-mid_c.rds")) + last_c <- readRDS(file.path( + path, + if (isTRUE(census.surname)) "wru-data-census_last_c.rds" else "wru-data-last_c.rds" + )) + + if (!is.data.frame(first_c) || !is.data.frame(mid_c) || !is.data.frame(last_c)) { + stop("wru cached name dictionaries are not data.frames; clear cache and re-run wru_data_preflight().") } p_eth <- c("c_whi", "c_bla", "c_his", "c_asi", "c_oth") + + # optional custom dictionaries; align names to shipped schema if (is.null(table.surnames)) { lastNameDict <- last_c } else { @@ -108,26 +147,22 @@ merge_names <- function(voter.file, namesToUse, census.surname, table.surnames = firstNameDict <- first_c } else { firstNameDict <- table.first - firstNameDict[is.na(firstNameDict)] <- 0 names(firstNameDict) <- names(first_c) + firstNameDict[is.na(firstNameDict)] <- 0 } if (is.null(table.middle)) { middleNameDict <- mid_c } else { middleNameDict <- table.middle - middleNameDict[is.na(middleNameDict)] <- 0 names(middleNameDict) <- names(mid_c) + 
middleNameDict[is.na(middleNameDict)] <- 0 } - - nameDict <- list( - "first" = firstNameDict, - "middle" = middleNameDict, - "last" = lastNameDict - ) - - ## Convert names in voter file to upper case + + nameDict <- list(first = firstNameDict, middle = middleNameDict, last = lastNameDict) + + # --- initial normalization & merges --------------------------------------- df <- voter.file - + df$lastname.match <- df$lastname.upper <- toupper(as.character(df$surname)) if (grepl("first", namesToUse)) { df$firstname.match <- df$firstname.upper <- toupper(as.character(df$first)) @@ -136,8 +171,8 @@ merge_names <- function(voter.file, namesToUse, census.surname, table.surnames = df$middlename.match <- df$middlename.upper <- toupper(as.character(df$middle)) df$middlename.match[is.na(df$middlename.match)] <- "" } - - ## Merge Surnames with Census List (No Cleaning Yet) + + # first pass merges (raw) df <- merge(df, lastNameDict, by.x = "lastname.match", by.y = "last_name", all.x = TRUE, sort = FALSE) if (grepl("first", namesToUse)) { df <- merge(df, firstNameDict, by.x = "firstname.match", by.y = "first_name", all.x = TRUE, sort = FALSE) @@ -145,201 +180,255 @@ merge_names <- function(voter.file, namesToUse, census.surname, table.surnames = if (grepl("middle", namesToUse)) { df <- merge(df, middleNameDict, by.x = "middlename.match", by.y = "middle_name", all.x = TRUE, sort = FALSE) } - - if (namesToUse == "surname" && sum(!(df$lastname.upper %in% lastNameDict$last_name)) == 0) { - return(df[, c(names(voter.file), "lastname.match", paste0(p_eth, "_last"))]) + + # compute flags now so early-returns also include them + if (isTRUE(return.unmatched)) { + if ("c_whi_last" %in% names(df)) df$last_matched <- !is.na(df$c_whi_last) + if ("c_whi_first" %in% names(df)) df$first_matched <- !is.na(df$c_whi_first) + if ("c_whi_middle" %in% names(df)) df$middle_matched <- !is.na(df$c_whi_middle) } - if (namesToUse == "surname, first" && sum(!(df$lastname.match %in% lastNameDict$last_name)) == 
0 && - sum(!(df$firstname.upper %in% firstNameDict$first_name)) == 0) { - return(df[, c(names(voter.file), "lastname.match", "firstname.match", paste0(p_eth, "_last"), paste0(p_eth, "_first"))]) + + # unified early-return helper + fast_return <- function() { + prob_cols <- c(paste0(p_eth, "_last")) + name_cols <- c("lastname.match") + flag_cols <- character(0) + if (isTRUE(return.unmatched) && "last_matched" %in% names(df)) flag_cols <- c(flag_cols, "last_matched") + if (grepl("first", namesToUse)) { + prob_cols <- c(prob_cols, paste0(p_eth, "_first")) + name_cols <- c(name_cols, "firstname.match") + if (isTRUE(return.unmatched) && "first_matched" %in% names(df)) flag_cols <- c(flag_cols, "first_matched") + } + if (grepl("middle", namesToUse)) { + prob_cols <- c(prob_cols, paste0(p_eth, "_middle")) + name_cols <- c(name_cols, "middlename.match") + if (isTRUE(return.unmatched) && "middle_matched" %in% names(df)) flag_cols <- c(flag_cols, "middle_matched") + } + cols <- c(names(voter.file), name_cols, flag_cols, prob_cols) + df[, cols, drop = FALSE] } - if (namesToUse == "surname, first, middle" && sum(!(df$lastname.match %in% lastNameDict$last_name)) == 0 && - sum(!(df$firstname.upper %in% firstNameDict$first_name)) == 0 && sum(!(df$middlename.upper %in% middleNameDict$middle_name)) == 0) { - return(df[, c(names(voter.file), "lastname.match", "firstname.match", "middlename.match", paste0(p_eth, "_last"), paste0(p_eth, "_first"), paste0(p_eth, "_middle"))]) + + if (namesToUse == "surname" && + sum(!(df$lastname.upper %in% lastNameDict$last_name)) == 0) { + return(fast_return()) } - - ## Clean names (if specified by user) + if (namesToUse == "surname, first" && + sum(!(df$lastname.match %in% lastNameDict$last_name)) == 0 && + sum(!(df$firstname.upper %in% firstNameDict$first_name)) == 0) { + return(fast_return()) + } + if (namesToUse == "surname, first, middle" && + sum(!(df$lastname.match %in% lastNameDict$last_name)) == 0 && + sum(!(df$firstname.upper %in% 
firstNameDict$first_name)) == 0 && + sum(!(df$middlename.upper %in% middleNameDict$middle_name))== 0) { + return(fast_return()) + } + + # --- cleaning / fallback passes (dimension-safe) -------------------------- if (clean.names) { - for (nameType in strsplit(namesToUse, ", ")[[1]]) { - if(nameType=="surname"){ - nameType <- "last" - } - df1 <- df[!is.na(df[, paste("c_whi_", nameType, sep = "")]), ] # Matched names - df2 <- df[is.na(df[, paste("c_whi_", nameType, sep = "")]), ] # Unmatched names - - ## Remove All Punctuation and Try Merge Again + for (nameType0 in strsplit(namesToUse, ", ")[[1]]) { + nameType <- if (nameType0 == "surname") "last" else nameType0 + + col_prob <- paste0("c_whi_", nameType) + col_match <- paste0(nameType, "name.match") + col_upper <- paste0(nameType, "name.upper") + + df1 <- df[!is.na(df[, col_prob]), , drop = FALSE] + df2 <- df[ is.na(df[, col_prob]), , drop = FALSE] + + ## 1) strip punctuation if (nrow(df2) > 0) { - df2[, paste(nameType, "name.match", sep = "")] <- gsub("[^[:alnum:] ]", "", df2[, paste(nameType, "name.upper", sep = "")]) - - df2 <- merge(df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], nameDict[[nameType]], + df2[, col_match] <- gsub("[^[:alnum:] ]", "", df2[, col_upper]) + df2 <- merge( + df2[, !grepl(paste0("_", nameType), names(df2))], + nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = "_"), + by.x = col_match, by.y = paste0(nameType, "_name"), sort = FALSE ) - df2 <- df2[, names(df1)] # reorder the columns - - if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = ""), ]), ]) - df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ] + df2 <- df2[, names(df1), drop = FALSE] + + keep <- !is.na(df2[, col_prob]) + if (any(keep)) { + df1 <- rbind(df1, df2[keep, , drop = FALSE]) + df2 <- df2[!keep, , drop = FALSE] } } - - ## Remove All Spaces and 
Try Merge Again + + ## 2) strip spaces if (nrow(df2) > 0) { - df2[, paste(nameType, "name.match", sep = "")] <- gsub(" ", "", df2[, paste(nameType, "name.match", sep = "")]) - df2 <- merge(df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], nameDict[[nameType]], + df2[, col_match] <- gsub(" ", "", df2[, col_match]) + df2 <- merge( + df2[, !grepl(paste0("_", nameType), names(df2))], + nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = "_"), + by.x = col_match, by.y = paste0(nameType, "_name"), sort = FALSE ) - df2 <- df2[, names(df1)] # reorder the columns - - if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = ""), ]), ]) - df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ] + df2 <- df2[, names(df1), drop = FALSE] + + keep <- !is.na(df2[, col_prob]) + if (any(keep)) { + df1 <- rbind(df1, df2[keep, , drop = FALSE]) + df2 <- df2[!keep, , drop = FALSE] } } - - # Edits specific to common issues with last names - if (nameType == "last" & nrow(df2) > 0) { - - ## Remove Jr/Sr/III Suffixes for last names - suffix <- c("JUNIOR", "SENIOR", "THIRD", "III", "JR", " II", " J R", " S R", " IV") - for (i in 1:length(suffix)) { - df2$lastname.match <- ifelse(substr(df2$lastname.match, nchar(df2$lastname.match) - (nchar(suffix)[i] - 1), nchar(df2$lastname.match)) == suffix[i], - substr(df2$lastname.match, 1, nchar(df2$lastname.match) - nchar(suffix)[i]), + + ## 3) last-name suffix cleanup + if (nameType == "last" && nrow(df2) > 0) { + suffix <- c("JUNIOR","SENIOR","THIRD","III","JR"," II"," J R"," S R"," IV") + for (suf in suffix) { + df2$lastname.match <- ifelse( + substr(df2$lastname.match, + nchar(df2$lastname.match) - (nchar(suf) - 1), + nchar(df2$lastname.match)) == suf, + substr(df2$lastname.match, 1, nchar(df2$lastname.match) - nchar(suf)), df2$lastname.match ) } - df2$lastname.match <- 
ifelse(nchar(df2$lastname.match) >= 7, - ifelse(substr(df2$lastname.match, nchar(df2$lastname.match) - 1, nchar(df2$lastname.match)) == "SR", - substr(df2$lastname.match, 1, nchar(df2$lastname.match) - 2), - df2$lastname.match - ), + df2$lastname.match <- ifelse( + nchar(df2$lastname.match) >= 7 & + substr(df2$lastname.match, nchar(df2$lastname.match) - 1, nchar(df2$lastname.match)) == "SR", + substr(df2$lastname.match, 1, nchar(df2$lastname.match) - 2), df2$lastname.match - ) # Remove "SR" only if name has at least 7 characters - + ) + df2 <- merge( - df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], - lastNameDict, by.x = "lastname.match", by.y = "last_name", - all.x = TRUE, sort = FALSE) - df2 <- df2[, names(df1)] # reorder the columns - - if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = ""), ]), ]) - df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ] + df2[, !grepl(paste0("_", nameType), names(df2))], + lastNameDict, + by.x = "lastname.match", by.y = "last_name", + all.x = TRUE, sort = FALSE + ) + df2 <- df2[, names(df1), drop = FALSE] + + keep <- !is.na(df2[, col_prob]) + if (any(keep)) { + df1 <- rbind(df1, df2[keep, , drop = FALSE]) + df2 <- df2[!keep, , drop = FALSE] } } - - - ## Names with Hyphens or Spaces, e.g. 
Double-Barreled Names + + ## 4) double-barreled (first half then second half) if (nrow(df2) > 0) { df2$name2 <- df2$name1 <- NA - df2$name1[grep("-", df2[, paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep("-", df2[, paste(nameType, "name.upper", sep = "")], value = T), "-"), "[", 1) - df2$name2[grep("-", df2[, paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep("-", df2[, paste(nameType, "name.upper", sep = "")], value = T), "-"), "[", 2) - df2$name1[grep(" ", df2[, paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep(" ", df2[, paste(nameType, "name.upper", sep = "")], value = T), " "), "[", 1) - df2$name2[grep(" ", df2[, paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep(" ", df2[, paste(nameType, "name.upper", sep = "")], value = T), " "), "[", 2) - - ## Use first half of name to merge in priors - df2[, paste(nameType, "name.match", sep = "")] <- as.character(df2$name1) - df2 <- merge(df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], nameDict[[nameType]], + + # hyphen split + idx <- grep("-", df2[, col_upper]) + if (length(idx)) { + parts <- strsplit(df2[idx, col_upper], "-") + df2$name1[idx] <- vapply(parts, `[`, character(1), 1) + df2$name2[idx] <- vapply(parts, `[`, character(1), 2) + } + # space split (fill only NAs left by hyphen step) + idx <- grep(" ", df2[, col_upper]) + if (length(idx)) { + parts <- strsplit(df2[idx, col_upper], " ") + fill1 <- vapply(parts, `[`, character(1), 1) + fill2 <- vapply(parts, `[`, character(1), 2) + df2$name1[idx] <- ifelse(is.na(df2$name1[idx]), fill1, df2$name1[idx]) + df2$name2[idx] <- ifelse(is.na(df2$name2[idx]), fill2, df2$name2[idx]) + } + + # first half + df2[, col_match] <- as.character(df2$name1) + df2 <- merge( + df2[, !grepl(paste0("_", nameType), names(df2))], + nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = "_"), + by.x = col_match, by.y = paste0(nameType, "_name"), 
sort = FALSE ) - df2 <- df2[, c(names(df1), "name1", "name2")] # reorder the columns - - if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = "")]), !(names(df2) %in% c("name1", "name2"))]) - df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ] + df2 <- df2[, c(names(df1), "name1", "name2"), drop = FALSE] + + keep <- !is.na(df2[, col_prob]) + if (any(keep)) { + df1 <- rbind(df1, df2[keep, !(names(df2) %in% c("name1","name2")), drop = FALSE]) + df2 <- df2[!keep, , drop = FALSE] } } - - ## Use second half of name to merge in priors for rest + if (nrow(df2) > 0) { - df2[, paste(nameType, "name.match", sep = "")] <- as.character(df2$name2) - df2 <- merge(df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], nameDict[[nameType]], + # second half + df2[, col_match] <- as.character(df2$name2) + df2 <- merge( + df2[, !grepl(paste0("_", nameType), names(df2))], + nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = "_"), + by.x = col_match, by.y = paste0(nameType, "_name"), sort = FALSE ) - df2 <- df2[, c(names(df1), "name1", "name2")] # reorder the columns - - if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = "")]), !(names(df2) %in% c("name1", "name2"))]) - df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ] + df2 <- df2[, c(names(df1), "name1", "name2"), drop = FALSE] + + keep <- !is.na(df2[, col_prob]) + if (any(keep)) { + df1 <- rbind(df1, df2[keep, !(names(df2) %in% c("name1","name2")), drop = FALSE]) + df2 <- df2[!keep, , drop = FALSE] } } - + + # stitch back if (nrow(df2) > 0) { - df <- rbind(df1, df2[, !(names(df2) %in% c("name1", "name2"))]) + df <- rbind(df1, df2[, !(names(df2) %in% c("name1","name2")), drop = FALSE]) } else { df <- df1 } } } - - - ## For unmatched names, just fill with an column 
mean if impute is true, or with constant if false + + # --- reporting ------------------------------------------------------------- c_miss_last <- mean(is.na(df$c_whi_last)) if (c_miss_last > 0) { - message(paste(paste(sum(is.na(df$c_whi_last)), " (", round(100 * mean(is.na(df$c_whi_last)), 1), "%) individuals' last names were not matched.", sep = ""))) + message(paste0(sum(is.na(df$c_whi_last)), " (", round(100 * c_miss_last, 1), "%) individuals' last names were not matched.")) } if (grepl("first", namesToUse)) { c_miss_first <- mean(is.na(df$c_whi_first)) if (c_miss_first > 0) { - message(paste(paste(sum(is.na(df$c_whi_first)), " (", round(100 * mean(is.na(df$c_whi_first)), 1), "%) individuals' first names were not matched.", sep = ""))) + message(paste0(sum(is.na(df$c_whi_first)), " (", round(100 * c_miss_first, 1), "%) individuals' first names were not matched.")) } } if (grepl("middle", namesToUse)) { c_miss_mid <- mean(is.na(df$c_whi_middle)) if (c_miss_mid > 0) { - message(paste(paste(sum(is.na(df$c_whi_middle)), " (", round(100 * mean(is.na(df$c_whi_middle)), 1), "%) individuals' middle names were not matched.", sep = ""))) + message(paste0(sum(is.na(df$c_whi_middle)), " (", round(100 * c_miss_mid, 1), "%) individuals' middle names were not matched.")) } } - + + # flags BEFORE imputation + if (isTRUE(return.unmatched)) { + if ("c_whi_last" %in% names(df)) df$last_matched <- !is.na(df$c_whi_last) + if ("c_whi_first" %in% names(df)) df$first_matched <- !is.na(df$c_whi_first) + if ("c_whi_middle" %in% names(df)) df$middle_matched <- !is.na(df$c_whi_middle) + } + + # --- imputation ------------------------------------------------------------ if (impute.missing) { - impute.vec <- colMeans(df[, grep("c_", names(df), value = TRUE)], na.rm = TRUE) - for (i in grep("c_", names(df), value = TRUE)) { + impute.vec <- colMeans(df[, grep("^c_", names(df), value = TRUE), drop = FALSE], na.rm = TRUE) + for (i in grep("^c_", names(df), value = TRUE)) { df[, i] <- 
dplyr::coalesce(df[, i], impute.vec[i]) } } else { - for (i in grep("c_", names(df), value = TRUE)) { + for (i in grep("^c_", names(df), value = TRUE)) { df[, i] <- dplyr::coalesce(df[, i], 1) } } - - # return the data - if (namesToUse == "surname") { - return(df[, c(names(voter.file), "lastname.match", paste(p_eth, "last", sep = "_"))]) - } else if (namesToUse == "surname, first") { - return(df[, c( - names(voter.file), "lastname.match", "firstname.match", - paste(p_eth, "last", sep = "_"), paste(p_eth, "first", sep = "_") - )]) - } else if (namesToUse == "surname, first, middle") { - return(df[, c( - names(voter.file), "lastname.match", "firstname.match", "middlename.match", - paste(p_eth, "last", sep = "_"), paste(p_eth, "first", sep = "_"), paste(p_eth, "middle", sep = "_") - )]) + + # --- unified return -------------------------------------------------------- + prob_cols <- c(paste0(p_eth, "_last")) + name_cols <- c("lastname.match") + flag_cols <- character(0) + if (isTRUE(return.unmatched) && "last_matched" %in% names(df)) flag_cols <- c("last_matched", flag_cols) + + if (grepl("first", namesToUse)) { + prob_cols <- c(prob_cols, paste0(p_eth, "_first")) + name_cols <- c(name_cols, "firstname.match") + if (isTRUE(return.unmatched) && "first_matched" %in% names(df)) flag_cols <- c(flag_cols, "first_matched") } -} - - -#' Preflight for name data -#' -#' Checks if namedata is available in the current working directory, if not -#' downloads it from github using piggyback. By default, wru will download the -#' data to a temporary directory that lasts as long as your session does. -#' However, you may wish to set the \code{wru_data_wd} option to save the -#' downloaded data to your current working directory for more permanence. -#' -#' @importFrom piggyback pb_download -wru_data_preflight <- function() { - dest <- ifelse(getOption("wru_data_wd", default = FALSE), getwd(), tempdir()) - tryCatch( - # Oddity of conditions for .token. 
Ignores token if is "" - piggyback::pb_download(repo = "kosukeimai/wru", dest = dest, .token = "", tag = "v2.0.0"), - error = function(e) message("There was an error retrieving data: ", e$message) - ) + if (grepl("middle", namesToUse)) { + prob_cols <- c(prob_cols, paste0(p_eth, "_middle")) + name_cols <- c(name_cols, "middlename.match") + if (isTRUE(return.unmatched) && "middle_matched" %in% names(df)) flag_cols <- c(flag_cols, "middle_matched") + } + + cols <- c(names(voter.file), name_cols, flag_cols, prob_cols) + df[, cols, drop = FALSE] } diff --git a/R/predict_race.R b/R/predict_race.R index 1f31b7c..eddb1b1 100644 --- a/R/predict_race.R +++ b/R/predict_race.R @@ -74,6 +74,8 @@ #' Whatever the name of the party registration field in \code{\var{voter.file}}, #' it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other. #' @param retry The number of retries at the census website if network interruption occurs. +#' @param return.unmatched Generates Boolean columns for each name reporting +#' whether a match was made. Default is set to \code{TRUE}. #' @param impute.missing Logical, defaults to TRUE. Should missing be imputed? #' @param skip_bad_geos Logical. Option to have the function skip any geolocations that are not present #' in the census data, returning a partial data set. 
Default is set to \code{FALSE}, in which case it @@ -152,6 +154,7 @@ predict_race <- function( year = "2020", party = NULL, retry = 3, + return.unmatched = TRUE, impute.missing = TRUE, skip_bad_geos = FALSE, use.counties = FALSE, @@ -223,6 +226,7 @@ predict_race <- function( surname.only=surname.only, census.data = census.data, retry = retry, + return.unmatched = return.unmatched, impute.missing = impute.missing, skip_bad_geos = skip_bad_geos, census.surname = census.surname, @@ -253,6 +257,7 @@ predict_race <- function( surname.only=surname.only, census.data = census.data, retry = retry, + return.unmatched = TRUE, impute.missing = TRUE, skip_bad_geos = skip_bad_geos, census.surname = census.surname, diff --git a/R/race_prediction_funs.R b/R/race_prediction_funs.R index 77ff933..7fb09e5 100644 --- a/R/race_prediction_funs.R +++ b/R/race_prediction_funs.R @@ -25,6 +25,7 @@ #' @param year See documentation in \code{race_predict}. #' @param party See documentation in \code{race_predict}. #' @param retry See documentation in \code{race_predict}. +#' @param return.unmatched See documentation in \code{race_predict}. #' @param impute.missing See documentation in \code{race_predict}. #' @param skip_bad_geos See documentation in \code{race_predict}. #' @param names.to.use See documentation in \code{race_predict}. 
@@ -60,6 +61,7 @@ NULL year = "2020", party, retry = 3, + return.unmatched = TRUE, impute.missing = TRUE, use.counties = FALSE ) { @@ -282,6 +284,7 @@ predict_race_new <- function( surname.only=FALSE, census.data = NULL, retry = 0, + return.unmatched = TRUE, impute.missing = TRUE, skip_bad_geos = FALSE, census.surname = FALSE, @@ -397,6 +400,7 @@ predict_race_new <- function( table.first=name.dictionaries[["first"]], table.middle=name.dictionaries[["middle"]], clean.names = TRUE, + return.unmatched = return.unmatched, impute.missing = impute.missing, model = 'BISG') @@ -425,7 +429,12 @@ predict_race_new <- function( ## Revert to national Pr(Race) for missing predictions colnames(preds) <- paste("pred", eth, sep = ".") - return(data.frame(cbind(voter.file[c(vars.orig)], preds))) + if(return.unmatched == TRUE) { + return(data.frame(cbind(voter.file, preds)) |> + dplyr::select(dplyr::all_of(vars.orig), dplyr::any_of(c("last", "first", "middle")), dplyr::ends_with("_matched"), -dplyr::starts_with("c_"),-dplyr::ends_with(".match"), dplyr::starts_with("pred."))) + } else { + return(data.frame(cbind(voter.file[c(vars.orig)], preds))) + } } @@ -448,6 +457,7 @@ predict_race_me <- function( surname.only = FALSE, census.data = NULL, retry = 0, + return_missing = TRUE, impute.missing = TRUE, census.surname = FALSE, use.counties = FALSE, diff --git a/README.Rmd b/README.Rmd index ccf678a..1cc5a1e 100644 --- a/README.Rmd +++ b/README.Rmd @@ -43,10 +43,15 @@ pak::pkg_install("kosukeimai/wru") Here is a simple example that predicts the race/ethnicity of voters based only on their surnames. +Note: The sample data `voters` is based on the 2010 Census, so we set `year = 2010` +in these examples; the default is the latest Census release year (2020). See +`get_census_data()` to retrieve Census data matching your own data set. If you get an +error with the default settings, explicitly setting `year` is the likely fix. 
+ ``` r library(wru) future::plan(future::multisession) -predict_race(voter.file = voters, surname.only = TRUE) +predict_race(voter.file = voters, surname.only = TRUE, year = 2010) ``` The above produces the following output, where the last five columns are probabilistic race/ethnicity predictions (e.g., `pred.his` is the probability of being Hispanic/Latino): @@ -86,7 +91,7 @@ Note that a valid API key must be stored in a `CENSUS_API_KEY` environment varia ``` r library(wru) -predict_race(voter.file = voters, census.geo = "tract", party = "PID") +predict_race(voter.file = voters, year = 2010, census.geo = "tract", party = "PID") ``` ``` VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth @@ -106,7 +111,7 @@ In `predict_race()`, the `census.geo` options are "county", "tract", "block" and Here is an example of prediction based on census statistics collected at the level of "place": ``` r -predict_race(voter.file = voters, census.geo = "place", party = "PID") +predict_race(voter.file = voters, year = 2010, census.geo = "place", party = "PID") ``` ``` VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth @@ -137,7 +142,7 @@ In this case, predictions are conditioned on age but not sex, so `age = TRUE` an library(wru) voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ] census.dc.nj <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE) -predict_race(voter.file = voters.dc.nj, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID") +predict_race(voter.file = voters.dc.nj, year = 2010, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID") ``` This produces the same result as the following statement, which downloads census data during evaluation rather than using pre-downloaded data: diff --git a/man/merge_names.Rd b/man/merge_names.Rd index ab98bee..1019c6a 100644 --- 
a/man/merge_names.Rd +++ b/man/merge_names.Rd @@ -12,89 +12,38 @@ merge_names( table.first = NULL, table.middle = NULL, clean.names = TRUE, + return.unmatched = TRUE, impute.missing = FALSE, model = "BISG" ) } \arguments{ -\item{voter.file}{An object of class \code{data.frame}. Must contain a row for each individual being predicted, -as well as a field named \code{\var{last}} containing each individual's surname. -If first name is also being used for prediction, the file must also contain a field -named \code{\var{first}}. If middle name is also being used for prediction, the field -must also contain a field named \code{\var{middle}}.} +\item{voter.file}{data.frame with at least a \code{surname} column; \code{first} and \code{middle} if used} -\item{namesToUse}{A character vector identifying which names to use for the prediction. -The default value is \code{"last"}, indicating that only the last name will be used. -Other options are \code{"last, first"}, indicating that both last and first names will be -used, and \code{"last, first, middle"}, indicating that last, first, and middle names will all -be used.} +\item{namesToUse}{"surname", "surname, first", or "surname, first, middle"} -\item{census.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -function will call \code{merge_surnames} to merge in Pr(Race | Surname) -from U.S. Census Surname List (2000, 2010, or 2020) and Spanish Surname List. -If \code{FALSE}, user must provide a \code{name.dictionary} (see below). -Default is \code{TRUE}.} +\item{census.surname}{logical; if TRUE, use Census/Spanish surname list} -\item{table.surnames}{An object of class \code{data.frame} provided by the -users as an alternative surname dictionary. It will consist of a list of -U.S. surnames, along with the associated probabilities P(name | ethnicity) -for ethnicities: white, Black, Hispanic, Asian, and other. Default is \code{NULL}. -(\code{\var{last_name}} for U.S. 
surnames, \code{\var{p_whi_last}} for White, -\code{\var{p_bla_last}} for Black, \code{\var{p_his_last}} for Hispanic, -\code{\var{p_asi_last}} for Asian, \code{\var{p_oth_last}} for other).} +\item{table.surnames}{Optional custom last-name dictionary (schema like shipped RDS)} -\item{table.first}{See \code{\var{table.surnames}}.} +\item{table.first}{Optional custom first-name dictionary} -\item{table.middle}{See \code{\var{table.surnames}}.} +\item{table.middle}{Optional custom middle-name dictionary} -\item{clean.names}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -any surnames in \code{\var{voter.file}} that cannot initially be matched -to the database will be cleaned, according to U.S. Census specifications, -in order to increase the chance of finding a match. Default is \code{TRUE}.} +\item{clean.names}{logical; try cleaning/fallback passes} -\item{impute.missing}{See \code{predict_race}.} +\item{return.unmatched}{logical; add *_matched boolean columns} -\item{model}{See \code{predict_race}.} +\item{impute.missing}{logical; impute c_* NA with column means (else 1)} + +\item{model}{kept for API parity} } \value{ -Output will be an object of class \code{data.frame}. It will -consist of the original user-input data with additional columns that -specify the part of the name matched with Census data (\code{\var{surname.match}}), -and the probabilities Pr(Race | Surname) for each racial group -(\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, -\code{\var{p_his}} for Hispanic/Latino, -\code{\var{p_asi}} for Asian and Pacific Islander, and -\code{\var{p_oth}} for Other/Mixed). +data.frame with probabilities (and flags if requested) } \description{ \code{merge_names} merges names in a user-input dataset with corresponding race/ethnicity probabilities derived from both the U.S. Census Surname List and Spanish Surname List and voter files from states in the Southern U.S. 
} -\details{ -This function allows users to match names in their dataset with database entries -estimating P(name | ethnicity) for each of the five major racial groups for each -name. The database probabilities are derived from both the U.S. Census Surname List -and Spanish Surname List and voter files from states in the Southern U.S. - -By default, the function matches names as follows: -\enumerate{ -\item Search raw surnames in the database; -\item Remove any punctuation and search again; -\item Remove any spaces and search again; -\item Remove suffixes (e.g., "Jr") and search again (last names only) -\item Split double-barreled names into two parts and search first part of name; -\item Split double-barreled names into two parts and search second part of name; -} - -Each step only applies to names not matched in a previous step. -Steps 2 through 6 are not applied if \code{clean.surname} is FALSE. - -Note: Any name appearing only on the Spanish Surname List is assigned a -probability of 1 for Hispanics/Latinos and 0 for all other racial groups. 
-} -\examples{ -data(voters) -\dontrun{try(merge_names(voters, namesToUse = "surname", census.surname = TRUE))} -} \keyword{internal} diff --git a/man/modfuns.Rd b/man/modfuns.Rd index 71481a8..e3f5b30 100644 --- a/man/modfuns.Rd +++ b/man/modfuns.Rd @@ -21,6 +21,7 @@ year = "2020", party, retry = 3, + return.unmatched = TRUE, impute.missing = TRUE, use.counties = FALSE ) @@ -37,6 +38,7 @@ predict_race_new( surname.only = FALSE, census.data = NULL, retry = 0, + return.unmatched = TRUE, impute.missing = TRUE, skip_bad_geos = FALSE, census.surname = FALSE, @@ -55,6 +57,7 @@ predict_race_me( surname.only = FALSE, census.data = NULL, retry = 0, + return_missing = TRUE, impute.missing = TRUE, census.surname = FALSE, use.counties = FALSE, @@ -94,6 +97,8 @@ If \code{\link{NULL}}, the default, attempts to find a census key stored in an \item{retry}{See documentation in \code{race_predict}.} +\item{return.unmatched}{See documentation in \code{race_predict}.} + \item{impute.missing}{See documentation in \code{race_predict}.} \item{use.counties}{A logical, defaulting to FALSE. Should census data be filtered by counties available in \var{census.data}?} diff --git a/man/predict_race.Rd b/man/predict_race.Rd index f3bbcd7..6b997fd 100644 --- a/man/predict_race.Rd +++ b/man/predict_race.Rd @@ -16,6 +16,7 @@ predict_race( year = "2020", party = NULL, retry = 3, + return.unmatched = TRUE, impute.missing = TRUE, skip_bad_geos = FALSE, use.counties = FALSE, @@ -105,6 +106,9 @@ it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other.} \item{retry}{The number of retries at the census website if network interruption occurs.} +\item{return.unmatched}{Generates Boolean columns for each name reporting +whether a match was made. Default is set to \code{TRUE}.} + \item{impute.missing}{Logical, defaults to TRUE. Should missing be imputed?} \item{skip_bad_geos}{Logical. 
Option to have the function skip any geolocations that are not present diff --git a/man/wru_data_preflight.Rd b/man/wru_data_preflight.Rd index 7da9e58..8fd6849 100644 --- a/man/wru_data_preflight.Rd +++ b/man/wru_data_preflight.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/merge_names.R \name{wru_data_preflight} \alias{wru_data_preflight} -\title{Preflight for name data} +\title{Preflight for name data (persistent, pinned, and verified)} \usage{ wru_data_preflight() } \description{ -Checks if namedata is available in the current working directory, if not -downloads it from github using piggyback. By default, wru will download the -data to a temporary directory that lasts as long as your session does. -However, you may wish to set the \code{wru_data_wd} option to save the -downloaded data to your current working directory for more permanence. +Downloads the four name dictionaries to a persistent directory if they are +missing. By default, data are cached in the session's \code{tempdir()}; set +\code{options(wru_data_wd = TRUE)} to cache them in \code{getwd()} instead, +keeping the downloaded files available across sessions without having to +re-download them each time. 
} diff --git a/tests/testthat/test-census_helper_v2.R b/tests/testthat/test-census_helper_v2.R index 3ea900d..04d75c7 100644 --- a/tests/testthat/test-census_helper_v2.R +++ b/tests/testthat/test-census_helper_v2.R @@ -47,10 +47,10 @@ test_that("helper returns verified census tract data",{ use.counties = FALSE, skip_bad_geos = FALSE ) - expect_equal(x[x$surname == "Lopez", "r_whi"], 0.767197, tolerance = 0.000001) - expect_equal(x[x$surname == "Khanna", "r_whi"], 0.708026, tolerance = 0.000001) - expect_equal(x[x$surname == "Lopez", "r_bla"], 0.09522743, tolerance = 0.000001) - expect_equal(x[x$surname == "Khanna", "r_bla"], 0.09544469, tolerance = 0.000001) + expect_equal(x[x$surname == "Lopez", "r_whi"], 0.5288848, tolerance = 0.000001) + expect_equal(x[x$surname == "Khanna", "r_whi"], 0.5740415, tolerance = 0.000001) + expect_equal(x[x$surname == "Lopez", "r_bla"], 0.06564720, tolerance = 0.000001) + expect_equal(x[x$surname == "Khanna", "r_bla"], 0.07738305, tolerance = 0.000001) }) skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY"))) @@ -89,3 +89,4 @@ test_that("New tables and legacy tables return equal race predictions",{ expect_equal(x$r_asi, y$r_asi, tolerance = .01) # expect_equal(x$r_oth, y$r_oth, tolerance = .01) }) + diff --git a/tests/testthat/test-predict_race_2010.R b/tests/testthat/test-predict_race_2010.R index a48031c..c92777d 100644 --- a/tests/testthat/test-predict_race_2010.R +++ b/tests/testthat/test-predict_race_2010.R @@ -14,7 +14,7 @@ test_that("Tests surname only predictions", { year = 2010, surname.only = TRUE)) # Test and confirm prediction output is as expected - expect_equal(dim(x), c(10, 20)) + expect_equal(dim(x), c(10, 21)) expect_equal(sum(is.na(x)), 0) expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.045, tolerance = 0.01) expect_equal(round(x[x$surname == "Johnson", "pred.his"], 4), 0.0272, tolerance = 0.01) @@ -33,7 +33,7 @@ test_that("Test BISG NJ at county level", { )) expect_equal(as.character(x$VoterID), 
as.character(c(1, 2, 4, 5, 6, 8, 9))) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x)), 0L) expect_equal(sum(x$surname == "Johnson"), 0) expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.0314, tolerance = 0.01) @@ -82,7 +82,7 @@ test_that("BISG NJ at block level", { use.counties = TRUE) ) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x$pred.asi)), 0L) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.7640, tolerance = 0.01) @@ -108,7 +108,7 @@ test_that("BISG NJ at block_group level", { use.counties = TRUE) ) - expect_equal(dim(x), c(7, 21)) + expect_equal(dim(x), c(7, 22)) expect_equal(sum(is.na(x$pred.asi)), 0) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.9183, tolerance = 0.01) @@ -173,7 +173,7 @@ test_that("Handles zero-pop. geolocations", { census.data = census, use.counties = TRUE) ) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x$pred.asi)), 0) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.91, tolerance = 0.01) diff --git a/tests/testthat/test-predict_race_2020.R b/tests/testthat/test-predict_race_2020.R index 89f8962..bb3ef19 100644 --- a/tests/testthat/test-predict_race_2020.R +++ b/tests/testthat/test-predict_race_2020.R @@ -25,7 +25,7 @@ test_that("Tests surname only predictions", { voter.file = voters, surname.only = TRUE)) # Test and confirm prediction output is as expected - expect_equal(dim(x), c(10, 20)) + expect_equal(dim(x), c(10, 21)) expect_equal(sum(is.na(x)), 0) expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.045, tolerance = 0.01) expect_equal(round(x[x$surname == "Johnson", "pred.his"], 4), 0.0272, tolerance = 0.01) @@ -43,7 +43,7 @@ test_that("Test BISG NJ at county level", { )) expect_equal(as.character(x$VoterID), as.character(c(1, 2, 4, 5, 6, 
8, 9))) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x)), 0L) expect_equal(sum(x$surname == "Johnson"), 0) expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.0181, tolerance = 0.01) @@ -90,7 +90,7 @@ test_that("BISG NJ at block level", { use.counties = TRUE) ) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x$pred.asi)), 0L) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.8078, tolerance = 0.01) @@ -115,7 +115,7 @@ test_that("BISG NJ at block_group level", { use.counties = TRUE) ) - expect_equal(dim(x), c(7, 21)) + expect_equal(dim(x), c(7, 22)) expect_equal(sum(is.na(x$pred.asi)), 0) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.9374, tolerance = 0.01) @@ -177,7 +177,7 @@ test_that("Handles zero-pop. geolocations", { census.data = census, use.counties = TRUE) ) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x$pred.asi)), 0) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.9444, tolerance = 0.01) @@ -205,3 +205,4 @@ test_that("Fixes for issue #68 work as expected", { expect_equal(three$pred.whi[2], 0.8397254) expect_equal(three$pred.whi[3], 0.8397254) }) + diff --git a/tests/testthat/wru-data-census_last_c.rds b/tests/testthat/wru-data-census_last_c.rds new file mode 100644 index 0000000..b926aaf Binary files /dev/null and b/tests/testthat/wru-data-census_last_c.rds differ diff --git a/tests/testthat/wru-data-first_c.rds b/tests/testthat/wru-data-first_c.rds new file mode 100644 index 0000000..3b364a7 Binary files /dev/null and b/tests/testthat/wru-data-first_c.rds differ diff --git a/tests/testthat/wru-data-last_c.rds b/tests/testthat/wru-data-last_c.rds new file mode 100644 index 0000000..57e7bce Binary files /dev/null and b/tests/testthat/wru-data-last_c.rds differ diff --git 
a/tests/testthat/wru-data-mid_c.rds b/tests/testthat/wru-data-mid_c.rds new file mode 100644 index 0000000..0228974 Binary files /dev/null and b/tests/testthat/wru-data-mid_c.rds differ diff --git a/wru-data-census_last_c.rds b/wru-data-census_last_c.rds new file mode 100644 index 0000000..b926aaf Binary files /dev/null and b/wru-data-census_last_c.rds differ diff --git a/wru-data-first_c.rds b/wru-data-first_c.rds new file mode 100644 index 0000000..3b364a7 Binary files /dev/null and b/wru-data-first_c.rds differ diff --git a/wru-data-last_c.rds b/wru-data-last_c.rds new file mode 100644 index 0000000..57e7bce Binary files /dev/null and b/wru-data-last_c.rds differ diff --git a/wru-data-mid_c.rds b/wru-data-mid_c.rds new file mode 100644 index 0000000..0228974 Binary files /dev/null and b/wru-data-mid_c.rds differ