kosukeimai · mdblocker · Oct 10, 2025 · Oct 14, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -55,4 +55,4 @@ LazyData: yes
 LazyDataCompression: xz
 LazyLoad: yes
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,10 +7,8 @@ export(get_census_data)
 export(predict_race)
 import(PL94171)
 importFrom(Rcpp,evalCpp)
-importFrom(dplyr,coalesce)
 importFrom(dplyr,pull)
 importFrom(furrr,future_map_dfr)
-importFrom(piggyback,pb_download)
 importFrom(purrr,map_dfr)
 importFrom(rlang,"%||%")
 importFrom(stats,rmultinom)

diff --git a/R/merge_names.R b/R/merge_names.R
diff --git a/R/predict_race.R b/R/predict_race.R
@@ -74,6 +74,8 @@
 #' Whatever the name of the party registration field in \code{\var{voter.file}},
 #' it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other.
 #' @param retry The number of retries at the census website if network interruption occurs.
+#' @param return.unmatched Generates Boolean columns for each name reporting 
+#' whether a match was made. Default is set to \code{TRUE}.
 #' @param impute.missing Logical, defaults to TRUE. Should missing be imputed?
 #' @param skip_bad_geos Logical. Option to have the function skip any geolocations that are not present 
 #' in the census data, returning a partial data set. Default is set to \code{FALSE}, in which case it
@@ -152,6 +154,7 @@ predict_race <- function(
     year = "2020",
     party = NULL,
     retry = 3,
+    return.unmatched = TRUE,
     impute.missing = TRUE,
     skip_bad_geos = FALSE,
     use.counties = FALSE,
@@ -223,6 +226,7 @@ predict_race <- function(
                               surname.only=surname.only,
                               census.data = census.data,
                               retry = retry,
+                              return.unmatched = return.unmatched,
                               impute.missing = impute.missing,
                               skip_bad_geos = skip_bad_geos,
                               census.surname = census.surname,
@@ -253,6 +257,7 @@ predict_race <- function(
                                  surname.only=surname.only,
                                  census.data = census.data,
                                  retry = retry,
+                                 return.unmatched = TRUE,
                                  impute.missing = TRUE,
                                  skip_bad_geos = skip_bad_geos,
                                  census.surname = census.surname,

diff --git a/R/race_prediction_funs.R b/R/race_prediction_funs.R
@@ -25,6 +25,7 @@
 #' @param year See documentation in \code{race_predict}.
 #' @param party See documentation in \code{race_predict}.
 #' @param retry See documentation in \code{race_predict}.
+#' @param return.unmatched See documentation in \code{race_predict}.
 #' @param impute.missing See documentation in \code{race_predict}.
 #' @param skip_bad_geos See documentation in \code{race_predict}.
 #' @param names.to.use See documentation in \code{race_predict}.
@@ -60,6 +61,7 @@ NULL
     year = "2020",
     party,
     retry = 3,
+    return.unmatched = TRUE,
     impute.missing = TRUE,
     use.counties = FALSE
 ) {
@@ -282,6 +284,7 @@ predict_race_new <- function(
     surname.only=FALSE,
     census.data = NULL,
     retry = 0,
+    return.unmatched = TRUE,
     impute.missing = TRUE,
     skip_bad_geos = FALSE,
     census.surname = FALSE,
@@ -397,6 +400,7 @@ predict_race_new <- function(
                             table.first=name.dictionaries[["first"]],
                             table.middle=name.dictionaries[["middle"]],
                             clean.names = TRUE,
+                            return.unmatched = return.unmatched,
                             impute.missing = impute.missing,
                             model = 'BISG')
 
@@ -425,7 +429,12 @@ predict_race_new <- function(
   ## Revert to national Pr(Race) for missing predictions
   colnames(preds) <- paste("pred", eth, sep = ".")
 
-  return(data.frame(cbind(voter.file[c(vars.orig)], preds)))
+  if(return.unmatched == TRUE) {
+    return(data.frame(cbind(voter.file, preds)) |> 
+             dplyr::select(dplyr::all_of(vars.orig), dplyr::any_of(c("last", "first", "middle")), dplyr::ends_with("_matched"), -dplyr::starts_with("c_"),-dplyr::ends_with(".match"), dplyr::starts_with("pred.")))
+  } else {
+    return(data.frame(cbind(voter.file[c(vars.orig)], preds)))
+  }
 }
 
 
@@ -448,6 +457,7 @@ predict_race_me <- function(
     surname.only = FALSE,
     census.data = NULL,
     retry = 0,
+    return_missing = TRUE,
     impute.missing = TRUE,
     census.surname = FALSE,
     use.counties = FALSE,

diff --git a/README.Rmd b/README.Rmd
@@ -43,10 +43,15 @@ pak::pkg_install("kosukeimai/wru")
 
 Here is a simple example that predicts the race/ethnicity of voters based only on their surnames.
 
+Note: Sample data `voters` is based on 2010 Census, therefore we fix that variable here. Default 
+is the latest Census release year (2020). See `get_census_data()` to retrieve Census data related to your 
+data set. If you are getting an error similar to setting `year = 2020`, this is what needs to be 
+ addressed. 
+
 ``` r
 library(wru)
 future::plan(future::multisession)
-predict_race(voter.file = voters, surname.only = TRUE)
+predict_race(voter.file = voters, surname.only = TRUE, year = 2010)
 ```
 
 The above produces the following output, where the last five columns are probabilistic race/ethnicity predictions (e.g., `pred.his` is the probability of being Hispanic/Latino):
@@ -86,7 +91,7 @@ Note that a valid API key must be stored in a `CENSUS_API_KEY` environment varia
 
 ``` r
 library(wru)
-predict_race(voter.file = voters, census.geo = "tract", party = "PID")
+predict_race(voter.file = voters, year = 2010, census.geo = "tract", party = "PID")
 ```
 ```
  VoterID    surname state CD county  tract block age sex party PID place    pred.whi     pred.bla     pred.his   pred.asi    pred.oth
@@ -106,7 +111,7 @@ In `predict_race()`, the `census.geo` options are "county", "tract", "block" and
 Here is an example of prediction based on census statistics collected at the level of "place":
 
 ``` r
-predict_race(voter.file = voters, census.geo = "place", party = "PID")
+predict_race(voter.file = voters, year = 2010, census.geo = "place", party = "PID")
 ```
 ```
  VoterID    surname state CD county  tract block age sex party PID place    pred.whi     pred.bla     pred.his   pred.asi    pred.oth
@@ -137,7 +142,7 @@ In this case, predictions are conditioned on age but not sex, so `age = TRUE` an
 library(wru)
 voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ]
 census.dc.nj <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE)
-predict_race(voter.file = voters.dc.nj, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID")
+predict_race(voter.file = voters.dc.nj, year = 2010, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID")
 ```
 
 This produces the same result as the following statement, which downloads census data during evaluation rather than using pre-downloaded data:

diff --git a/man/merge_names.Rd b/man/merge_names.Rd
diff --git a/man/modfuns.Rd b/man/modfuns.Rd
diff --git a/man/predict_race.Rd b/man/predict_race.Rd
diff --git a/man/wru_data_preflight.Rd b/man/wru_data_preflight.Rd
diff --git a/tests/testthat/test-census_helper_v2.R b/tests/testthat/test-census_helper_v2.R
@@ -47,10 +47,10 @@ test_that("helper returns verified census tract data",{
     use.counties = FALSE,
     skip_bad_geos = FALSE
     )
-  expect_equal(x[x$surname == "Lopez", "r_whi"], 0.767197, tolerance = 0.000001)
-  expect_equal(x[x$surname == "Khanna", "r_whi"], 0.708026, tolerance = 0.000001)
-  expect_equal(x[x$surname == "Lopez", "r_bla"], 0.09522743, tolerance = 0.000001)
-  expect_equal(x[x$surname == "Khanna", "r_bla"], 0.09544469, tolerance = 0.000001)
+  expect_equal(x[x$surname == "Lopez", "r_whi"], 0.5288848, tolerance = 0.000001)
+  expect_equal(x[x$surname == "Khanna", "r_whi"], 0.5740415, tolerance = 0.000001)
+  expect_equal(x[x$surname == "Lopez", "r_bla"], 0.06564720, tolerance = 0.000001)
+  expect_equal(x[x$surname == "Khanna", "r_bla"], 0.07738305, tolerance = 0.000001)
 })
 
 skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY")))
@@ -89,3 +89,4 @@ test_that("New tables and legacy tables return equal race predictions",{
   expect_equal(x$r_asi, y$r_asi, tolerance = .01)
   # expect_equal(x$r_oth, y$r_oth, tolerance = .01)
 })
+
diff --git a/tests/testthat/test-predict_race_2010.R b/tests/testthat/test-predict_race_2010.R
@@ -14,7 +14,7 @@ test_that("Tests surname only predictions", {
     year = 2010,
     surname.only = TRUE))
   # Test and confirm prediction output is as expected
-  expect_equal(dim(x), c(10, 20))
+  expect_equal(dim(x), c(10, 21))
   expect_equal(sum(is.na(x)), 0)
   expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.045, tolerance = 0.01)
   expect_equal(round(x[x$surname == "Johnson", "pred.his"], 4), 0.0272, tolerance = 0.01)
@@ -33,7 +33,7 @@ test_that("Test BISG NJ at county level", {
     ))
 
   expect_equal(as.character(x$VoterID), as.character(c(1, 2, 4, 5, 6, 8, 9)))
-  expect_equal(dim(x), c(7, 20))
+  expect_equal(dim(x), c(7, 21))
   expect_equal(sum(is.na(x)), 0L)
   expect_equal(sum(x$surname == "Johnson"), 0)
   expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.0314, tolerance = 0.01)
@@ -82,7 +82,7 @@ test_that("BISG NJ at block level", {
     use.counties = TRUE)
   )
 
-  expect_equal(dim(x), c(7, 20))
+  expect_equal(dim(x), c(7, 21))
   expect_equal(sum(is.na(x$pred.asi)), 0L)
   expect_true(!any(duplicated(x$surname)))
   expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.7640, tolerance = 0.01)
@@ -108,7 +108,7 @@ test_that("BISG NJ at block_group level", {
     use.counties = TRUE)
   )
 
-  expect_equal(dim(x), c(7, 21))
+  expect_equal(dim(x), c(7, 22))
   expect_equal(sum(is.na(x$pred.asi)), 0)
   expect_true(!any(duplicated(x$surname)))
   expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.9183, tolerance = 0.01)
@@ -173,7 +173,7 @@ test_that("Handles zero-pop. geolocations", {
     census.data = census, 
     use.counties = TRUE)
   )
-  expect_equal(dim(x), c(7, 20))
+  expect_equal(dim(x), c(7, 21))
   expect_equal(sum(is.na(x$pred.asi)), 0)
   expect_true(!any(duplicated(x$surname)))
   expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.91, tolerance = 0.01)