Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ LazyData: yes
LazyDataCompression: xz
LazyLoad: yes
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
2 changes: 0 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,8 @@ export(get_census_data)
export(predict_race)
import(PL94171)
importFrom(Rcpp,evalCpp)
importFrom(dplyr,coalesce)
importFrom(dplyr,pull)
importFrom(furrr,future_map_dfr)
importFrom(piggyback,pb_download)
importFrom(purrr,map_dfr)
importFrom(rlang,"%||%")
importFrom(stats,rmultinom)
Expand Down
545 changes: 317 additions & 228 deletions R/merge_names.R

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions R/predict_race.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@
#' Whatever the name of the party registration field in \code{\var{voter.file}},
#' it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other.
#' @param retry The number of retries at the census website if network interruption occurs.
#' @param return.unmatched Generates Boolean columns for each name reporting
#' whether a match was made. Default is set to \code{TRUE}.
#' @param impute.missing Logical, defaults to TRUE. Should missing be imputed?
#' @param skip_bad_geos Logical. Option to have the function skip any geolocations that are not present
#' in the census data, returning a partial data set. Default is set to \code{FALSE}, in which case it
Expand Down Expand Up @@ -152,6 +154,7 @@ predict_race <- function(
year = "2020",
party = NULL,
retry = 3,
return.unmatched = TRUE,
impute.missing = TRUE,
skip_bad_geos = FALSE,
use.counties = FALSE,
Expand Down Expand Up @@ -223,6 +226,7 @@ predict_race <- function(
surname.only=surname.only,
census.data = census.data,
retry = retry,
return.unmatched = return.unmatched,
impute.missing = impute.missing,
skip_bad_geos = skip_bad_geos,
census.surname = census.surname,
Expand Down Expand Up @@ -253,6 +257,7 @@ predict_race <- function(
surname.only=surname.only,
census.data = census.data,
retry = retry,
return.unmatched = TRUE,
impute.missing = TRUE,
skip_bad_geos = skip_bad_geos,
census.surname = census.surname,
Expand Down
12 changes: 11 additions & 1 deletion R/race_prediction_funs.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#' @param year See documentation in \code{race_predict}.
#' @param party See documentation in \code{race_predict}.
#' @param retry See documentation in \code{race_predict}.
#' @param return.unmatched See documentation in \code{race_predict}.
#' @param impute.missing See documentation in \code{race_predict}.
#' @param skip_bad_geos See documentation in \code{race_predict}.
#' @param names.to.use See documentation in \code{race_predict}.
Expand Down Expand Up @@ -60,6 +61,7 @@ NULL
year = "2020",
party,
retry = 3,
return.unmatched = TRUE,
impute.missing = TRUE,
use.counties = FALSE
) {
Expand Down Expand Up @@ -282,6 +284,7 @@ predict_race_new <- function(
surname.only=FALSE,
census.data = NULL,
retry = 0,
return.unmatched = TRUE,
impute.missing = TRUE,
skip_bad_geos = FALSE,
census.surname = FALSE,
Expand Down Expand Up @@ -397,6 +400,7 @@ predict_race_new <- function(
table.first=name.dictionaries[["first"]],
table.middle=name.dictionaries[["middle"]],
clean.names = TRUE,
return.unmatched = return.unmatched,
impute.missing = impute.missing,
model = 'BISG')

Expand Down Expand Up @@ -425,7 +429,12 @@ predict_race_new <- function(
## Revert to national Pr(Race) for missing predictions
colnames(preds) <- paste("pred", eth, sep = ".")

return(data.frame(cbind(voter.file[c(vars.orig)], preds)))
if(return.unmatched == TRUE) {
return(data.frame(cbind(voter.file, preds)) |>
dplyr::select(dplyr::all_of(vars.orig), dplyr::any_of(c("last", "first", "middle")), dplyr::ends_with("_matched"), -dplyr::starts_with("c_"),-dplyr::ends_with(".match"), dplyr::starts_with("pred.")))
} else {
return(data.frame(cbind(voter.file[c(vars.orig)], preds)))
}
}


Expand All @@ -448,6 +457,7 @@ predict_race_me <- function(
surname.only = FALSE,
census.data = NULL,
retry = 0,
return_missing = TRUE,
impute.missing = TRUE,
census.surname = FALSE,
use.counties = FALSE,
Expand Down
13 changes: 9 additions & 4 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,15 @@ pak::pkg_install("kosukeimai/wru")

Here is a simple example that predicts the race/ethnicity of voters based only on their surnames.

Note: Sample data `voters` is based on 2010 Census, therefore we fix that variable here. Default
is the latest Census release year (2020). See `get_census_data()` to retrieve Census data related to your
data set. If you are getting an error similar to setting `year = 2020`, this is what needs to be
addressed.

``` r
library(wru)
future::plan(future::multisession)
predict_race(voter.file = voters, surname.only = TRUE)
predict_race(voter.file = voters, surname.only = TRUE, year = 2010)
```

The above produces the following output, where the last five columns are probabilistic race/ethnicity predictions (e.g., `pred.his` is the probability of being Hispanic/Latino):
Expand Down Expand Up @@ -86,7 +91,7 @@ Note that a valid API key must be stored in a `CENSUS_API_KEY` environment varia

``` r
library(wru)
predict_race(voter.file = voters, census.geo = "tract", party = "PID")
predict_race(voter.file = voters, year = 2010, census.geo = "tract", party = "PID")
```
```
VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth
Expand All @@ -106,7 +111,7 @@ In `predict_race()`, the `census.geo` options are "county", "tract", "block" and
Here is an example of prediction based on census statistics collected at the level of "place":

``` r
predict_race(voter.file = voters, census.geo = "place", party = "PID")
predict_race(voter.file = voters, year = 2010, census.geo = "place", party = "PID")
```
```
VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth
Expand Down Expand Up @@ -137,7 +142,7 @@ In this case, predictions are conditioned on age but not sex, so `age = TRUE` an
library(wru)
voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ]
census.dc.nj <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE)
predict_race(voter.file = voters.dc.nj, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID")
predict_race(voter.file = voters.dc.nj, year = 2010, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID")
```

This produces the same result as the following statement, which downloads census data during evaluation rather than using pre-downloaded data:
Expand Down
77 changes: 13 additions & 64 deletions man/merge_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions man/modfuns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions man/predict_race.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 6 additions & 6 deletions man/wru_data_preflight.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions tests/testthat/test-census_helper_v2.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ test_that("helper returns verified census tract data",{
use.counties = FALSE,
skip_bad_geos = FALSE
)
expect_equal(x[x$surname == "Lopez", "r_whi"], 0.767197, tolerance = 0.000001)
expect_equal(x[x$surname == "Khanna", "r_whi"], 0.708026, tolerance = 0.000001)
expect_equal(x[x$surname == "Lopez", "r_bla"], 0.09522743, tolerance = 0.000001)
expect_equal(x[x$surname == "Khanna", "r_bla"], 0.09544469, tolerance = 0.000001)
expect_equal(x[x$surname == "Lopez", "r_whi"], 0.5288848, tolerance = 0.000001)
expect_equal(x[x$surname == "Khanna", "r_whi"], 0.5740415, tolerance = 0.000001)
expect_equal(x[x$surname == "Lopez", "r_bla"], 0.06564720, tolerance = 0.000001)
expect_equal(x[x$surname == "Khanna", "r_bla"], 0.07738305, tolerance = 0.000001)
})

skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY")))
Expand Down Expand Up @@ -89,3 +89,4 @@ test_that("New tables and legacy tables return equal race predictions",{
expect_equal(x$r_asi, y$r_asi, tolerance = .01)
# expect_equal(x$r_oth, y$r_oth, tolerance = .01)
})

10 changes: 5 additions & 5 deletions tests/testthat/test-predict_race_2010.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ test_that("Tests surname only predictions", {
year = 2010,
surname.only = TRUE))
# Test and confirm prediction output is as expected
expect_equal(dim(x), c(10, 20))
expect_equal(dim(x), c(10, 21))
expect_equal(sum(is.na(x)), 0)
expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.045, tolerance = 0.01)
expect_equal(round(x[x$surname == "Johnson", "pred.his"], 4), 0.0272, tolerance = 0.01)
Expand All @@ -33,7 +33,7 @@ test_that("Test BISG NJ at county level", {
))

expect_equal(as.character(x$VoterID), as.character(c(1, 2, 4, 5, 6, 8, 9)))
expect_equal(dim(x), c(7, 20))
expect_equal(dim(x), c(7, 21))
expect_equal(sum(is.na(x)), 0L)
expect_equal(sum(x$surname == "Johnson"), 0)
expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.0314, tolerance = 0.01)
Expand Down Expand Up @@ -82,7 +82,7 @@ test_that("BISG NJ at block level", {
use.counties = TRUE)
)

expect_equal(dim(x), c(7, 20))
expect_equal(dim(x), c(7, 21))
expect_equal(sum(is.na(x$pred.asi)), 0L)
expect_true(!any(duplicated(x$surname)))
expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.7640, tolerance = 0.01)
Expand All @@ -108,7 +108,7 @@ test_that("BISG NJ at block_group level", {
use.counties = TRUE)
)

expect_equal(dim(x), c(7, 21))
expect_equal(dim(x), c(7, 22))
expect_equal(sum(is.na(x$pred.asi)), 0)
expect_true(!any(duplicated(x$surname)))
expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.9183, tolerance = 0.01)
Expand Down Expand Up @@ -173,7 +173,7 @@ test_that("Handles zero-pop. geolocations", {
census.data = census,
use.counties = TRUE)
)
expect_equal(dim(x), c(7, 20))
expect_equal(dim(x), c(7, 21))
expect_equal(sum(is.na(x$pred.asi)), 0)
expect_true(!any(duplicated(x$surname)))
expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.91, tolerance = 0.01)
Expand Down
Loading