diff --git a/.Rbuildignore b/.Rbuildignore index 6cc9a29..af6362a 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,5 +12,3 @@ ChangeLog ^cran-comments\.md$ ^CRAN-SUBMISSION$ -^README\.Rmd$ -^data-raw$ diff --git a/.gitignore b/.gitignore index 0f2e051..4e2c122 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,3 @@ -.DS_Store - # History files .Rhistory .Rapp.history @@ -10,7 +8,6 @@ # RStudio files .Rproj.user/ .Rproj -.lazytest # produced vignettes vignettes/*.html @@ -24,4 +21,4 @@ vignettes/*.pdf src/RcppExports.o src/aux_funs.o src/sample_me.o -src/wru.so +src/wru.so \ No newline at end of file diff --git a/ChangeLog b/ChangeLog index 40b4298..09492e1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -16,8 +16,3 @@ Date Version Comment 2022-06-17 1.0.0 Updates to BISG, inclusion of fBISG and other package improvements 2022-10-04 1.0.1 Bug fixes for census url and census year 2023-06-12 2.0.0 Updated defaults to 2020 data, specifiy as next major version 2.0. -2024-02-15 3.0.0 Adding back age and sex functionality. Other improvements. -2024-02-27 3.0.1 Github has changed their policy on binary formats in releases, need to refer to older version -2024-04-02 3.0.2 Fixes a bug that led to overestimation of black and hispanic populations (see issue #145) -2024-05-24 3.0.3 Fixes a bug that pushed NaN into small population tracts (see issue #151) -2024-06-07 3.0.4 Fixes a bug related to a list of variables not being unnested properly (issue #151, #153) \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 94dc35c..08c4337 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,58 +1,47 @@ Package: wru -Title: Who are You? Bayesian Prediction of Racial Category Using Surname, - First Name, Middle Name, and Geolocation -Version: 3.0.4 -Date: 2024-06-07 +Version: 2.0.0 +Date: 2023-07-12 +Title: Who are You? 
Bayesian Prediction of Racial Category Using Surname, First Name, Middle Name, and + Geolocation Authors@R: c( - person("Kabir", "Khanna", , "kabirkhanna@gmail.com", role = "aut"), - person("Brandon", "Bertelsen", , "brandon@bertelsen.ca", role = c("aut", "cre")), - person("Santiago", "Olivella", , "olivella@unc.edu", role = "aut"), - person("Evan", "Rosenman", , "etrrosenman@gmail.com", role = "aut"), - person("Alexander", "Rossell Hayes", , "alexander@rossellhayes.com", role = "aut"), - person("Kosuke", "Imai", , "imai@harvard.edu", role = "aut") + person("Kabir", "Khanna", email = "kabirkhanna@gmail.com", role = c("aut")), + person("Brandon", "Bertelsen", email = "brandon@bertelsen.ca", role = c("aut","cre")), + person("Santiago", "Olivella", email = "olivella@unc.edu", role = c("aut")), + person("Evan", "Rosenman", email = "etrrosenman@gmail.com", role = c("aut")), + person("Kosuke", "Imai", email = "imai@harvard.edu", role = c("aut")) ) -Description: Predicts individual race/ethnicity using surname, first name, - middle name, geolocation, and other attributes, such as gender and - age. The method utilizes Bayes' Rule (with optional measurement error - correction) to compute the posterior probability of each racial - category for any given individual. The package implements methods - described in Imai and Khanna (2016) "Improving Ecological Inference by - Predicting Individual Ethnicity from Voter Registration Records" - Political Analysis and Imai, Olivella, and - Rosenman (2022) "Addressing census data problems in race imputation - via fully Bayesian Improved Surname Geocoding and name supplements" - . The package also incorporates the data - described in Rosenman, Olivella, and Imai (2023) "Race and ethnicity - data for first, middle, and surnames" - . -License: GPL (>= 3) +Description: Predicts individual race/ethnicity using surname, first name, middle name, geolocation, + and other attributes, such as gender and age. 
The method utilizes Bayes' + Rule (with optional measurement error correction) to compute the posterior probability of each racial category for any given + individual. The package implements methods described in Imai and Khanna (2016) + "Improving Ecological Inference by Predicting Individual Ethnicity from Voter + Registration Records" Political Analysis and Imai, Olivella, and Rosenman (2022) + "Addressing census data problems in race imputation via fully Bayesian Improved Surname Geocoding and name supplements" + . The package also incorporates the data described in Rosenman, Olivella, and Imai (2023) + "Race and ethnicity data for first, middle, and surnames" . URL: https://github.com/kosukeimai/wru BugReports: https://github.com/kosukeimai/wru/issues Depends: R (>= 4.1.0), utils Imports: - cli, dplyr, - tidyr, furrr, future, - piggyback (>= 0.1.4), - PL94171, purrr, Rcpp, - rlang + piggyback (>= 0.1.4), + PL94171 Suggests: - covr, testthat (>= 3.0.0), - tidycensus + covr LinkingTo: Rcpp, RcppArmadillo -Config/testthat/edition: 3 -Encoding: UTF-8 +LazyLoad: yes LazyData: yes LazyDataCompression: xz -LazyLoad: yes -Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +License: GPL (>= 3) +RoxygenNote: 7.2.3 +Encoding: UTF-8 +Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index da53ada..5237a36 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,18 +1,14 @@ # Generated by roxygen2: do not edit by hand -export(as_fips_code) -export(as_state_abbreviation) export(format_legacy_data) export(get_census_data) export(predict_race) import(PL94171) importFrom(Rcpp,evalCpp) importFrom(dplyr,coalesce) -importFrom(dplyr,pull) importFrom(furrr,future_map_dfr) importFrom(piggyback,pb_download) importFrom(purrr,map_dfr) -importFrom(rlang,"%||%") importFrom(stats,rmultinom) importFrom(utils,setTxtProgressBar) importFrom(utils,txtProgressBar) diff --git a/R/RcppExports.R b/R/RcppExports.R index ba0989f..b5c2b45 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -5,9 
+5,13 @@ #' #' @param last_name Integer vector of last name identifiers for each record (zero indexed; as all that follow). Must match columns numbers in M_rs. #' @param first_name See last_name -#' @param mid_name See last_name +#' @param middle_name See last_name #' @param geo Integer vector of geographic units for each record. Must match column number in N_rg #' @param N_rg Integer matrix of race | geography counts in census (geograpgies in columns). +#' @param M_rs Integer matrix of race | surname counts in dictionary (surnames in columns). +#' @param M_rf Same as `M_rs`, but for first names (can be empty matrix for surname only models). +#' @param M_rm Same as `M_rs`, but for middle names (can be empty matrix for surname, or surname and first name only models). +#' @param alpha Numeric matrix of race | geography prior probabilities. #' @param pi_s Numeric matrix of race | surname prior probabilities. #' @param pi_f Same as `pi_s`, but for first names. #' @param pi_m Same as `pi_s`, but for middle names. @@ -15,6 +19,7 @@ #' @param which_names Integer; 0=surname only. 1=surname + first name. 2= surname, first, and middle names. #' @param samples Integer number of samples to take after (in total) #' @param burnin Integer number of samples to discard as burn-in of Markov chain +#' @param me_race Boolean; should measurement error in race | geography be corrected? #' @param race_init Integer vector of initial race assignments #' @param verbose Boolean; should informative messages be printed? #' diff --git a/R/census_data_preflight.R b/R/census_data_preflight.R index 8ed74d5..067117e 100644 --- a/R/census_data_preflight.R +++ b/R/census_data_preflight.R @@ -1,15 +1,31 @@ #' Preflight census data #' -#' @inheritParams predict_race +#' @param census.data See documentation in \code{race_predict}. +#' @param census.geo See documentation in \code{race_predict}. +#' @param year See documentation in \code{race_predict}. 
#' @keywords internal census_data_preflight <- function(census.data, census.geo, year) { - vars_ <- unlist(census_geo_api_names(year = year)) - legacy_vars <- unlist(census_geo_api_names_legacy(year = year)) + + if (year != "2020"){ + vars_ <- c( + pop_white = 'P005003', pop_black = 'P005004', + pop_aian = 'P005005', pop_asian = 'P005006', + pop_nhpi = 'P005007', pop_other = 'P005008', + pop_two = 'P005009', pop_hisp = 'P005010' + ) + } else { + vars_ <- c( + pop_white = 'P2_005N', pop_black = 'P2_006N', + pop_aian = 'P2_007N', pop_asian = 'P2_008N', + pop_nhpi = 'P2_009N', pop_other = 'P2_010N', + pop_two = 'P2_011N', pop_hisp = 'P2_002N' + ) + } test <- lapply(census.data, function(x) { nms_to_test <- names(x[[census.geo]]) - all(vars_ %in% nms_to_test) || all(legacy_vars %in% nms_to_test) + all(vars_ %in% nms_to_test) }) missings <- names(test)[!unlist(test)] diff --git a/R/census_geo_api.R b/R/census_geo_api.R index 4676ba5..c1857b1 100644 --- a/R/census_geo_api.R +++ b/R/census_geo_api.R @@ -5,11 +5,12 @@ #' This function allows users to download U.S. Census geographic data (2010 or 2020), #' at either the county, tract, block, or place level, for a particular state. #' -#' @inheritParams get_census_data +#' @param key A required character object. Must contain user's Census API +#' key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}. #' @param state A required character object specifying which state to extract Census data for, #' e.g., \code{"NJ"}. #' @param geo A character object specifying what aggregation level to use. -#' Use `"block"`, `"block_group"`, `"county"`, `"place"`, `"tract"`, or `"zcta"`. +#' Use \code{"county"}, \code{"tract"},\code{"block_group"}, \code{"block"}, or \code{"place"}. #' Default is \code{"tract"}. Warning: extracting block-level data takes very long. #' @param age A \code{TRUE}/\code{FALSE} object indicating whether to condition on #' age or not. 
If \code{FALSE} (default), function will return Pr(Geolocation | Race). @@ -34,53 +35,108 @@ #' #' @examples #' \dontshow{data(voters)} -#' \dontrun{census_geo_api(states = c("NJ", "DE"), geo = "block")} -#' \dontrun{census_geo_api(states = "FL", geo = "tract", age = TRUE, sex = TRUE)} -#' \dontrun{census_geo_api(states = "MA", geo = "place", age = FALSE, sex = FALSE, +#' \dontrun{census_geo_api(key = "...", states = c("NJ", "DE"), geo = "block")} +#' \dontrun{census_geo_api(key = "...", states = "FL", geo = "tract", age = TRUE, sex = TRUE)} +#' \dontrun{census_geo_api(key = "...", states = "MA", geo = "place", age = FALSE, sex = FALSE, #' year = "2020")} #' #' @references -#' Relies on `get_census_api()`, `get_census_api_2()`, and `vec_to_chunk()` functions authored by Nicholas Nagle, -#' available [here](https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html). +#' Relies on get_census_api, get_census_api_2, and vec_to_chunk functions authored by Nicholas Nagle, +#' available \href{https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. #' #' @importFrom furrr future_map_dfr #' @importFrom purrr map_dfr #' @keywords internal -census_geo_api <- function( - key = Sys.getenv("CENSUS_API_KEY"), - state, - geo = c("tract", "block", "block_group", "county", "place", "zcta"), - age = FALSE, - sex = FALSE, - year = c("2020", "2010"), - retry = 3, - save_temp = NULL, - counties = NULL -) { - key <- validate_key(key) - - geo <- tolower(geo) - geo <- rlang::arg_match(geo) - - year <- as.character(year) - year <- rlang::arg_match(year) +census_geo_api <- function(key = NULL, state, geo = "tract", age = FALSE, sex = FALSE, year = "2020", retry = 3, save_temp = NULL, counties = NULL) { + + if (missing(key)) { + stop('Must enter U.S. 
Census API key, which can be requested at https://api.census.gov/data/key_signup.html.') + } census <- NULL - state <- as_state_abbreviation(state) + state <- toupper(state) df.out <- NULL - state.fips <- as_fips_code(state) + # Building fips table (previously loaded via .rda) + fips.codes <- structure(list(State = structure(1:55, levels = c("AK", "AL", + "AR", "AS", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "GU", + "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", + "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", + "NV", "NY", "OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN", + "TX", "UT", "VA", "VI", "VT", "WA", "WI", "WV", "WY"), class = "factor"), + FIPS = c(2L, 1L, 5L, 60L, 4L, 6L, 8L, 9L, 11L, 10L, 12L, + 13L, 66L, 15L, 19L, 16L, 17L, 18L, 20L, 21L, 22L, 25L, 24L, + 23L, 26L, 27L, 29L, 28L, 30L, 37L, 38L, 31L, 33L, 34L, 35L, + 32L, 36L, 39L, 40L, 41L, 42L, 72L, 44L, 45L, 46L, 47L, 48L, + 49L, 51L, 78L, 50L, 53L, 55L, 54L, 56L)), class = "data.frame", row.names = c(NA, + -55L)) + state.fips <- fips.codes[fips.codes$State == state, "FIPS"] + state.fips <- ifelse(nchar(state.fips) == 1, paste0("0", state.fips), state.fips) + + # if (age == F & sex == F) { + # num <- ifelse(3:10 != 10, paste("0", 3:10, sep = ""), "10") + # vars <- paste("P0050", num, sep = "") + # } + + # assign variable values based on the year of the census data + if (as.character(year) != "2020"){ + vars_ <- c( + pop_white = 'P005003', pop_black = 'P005004', + pop_aian = 'P005005', pop_asian = 'P005006', + pop_nhpi = 'P005007', pop_other = 'P005008', + pop_two = 'P005009', pop_hisp = 'P005010' + ) + } else { + vars_ <- c( + pop_white = 'P2_005N', pop_black = 'P2_006N', + pop_aian = 'P2_007N', pop_asian = 'P2_008N', + pop_nhpi = 'P2_009N', pop_other = 'P2_010N', + pop_two = 'P2_011N', pop_hisp = 'P2_002N' + ) + } - vars <- census_geo_api_names(year = year, age = age, sex = sex) - census_data_url <- census_geo_api_url(year = year) + if (age == F & sex 
== T) { + eth.let <- c("I", "B", "H", "D", "E", "F", "C") + num <- as.character(c("01", "02", "26")) + for (e in 1:length(eth.let)) { + vars_ <- c(vars_, paste("P012", eth.let[e], "0", num, sep = "")) + } + } + + if (age == T & sex == F) { + eth.let <- c("I", "B", "H", "D", "E", "F", "C") + num <- as.character(c(c("01", "03", "04", "05", "06", "07", "08", "09"), seq(10, 25), seq(27, 49))) + for (e in 1:length(eth.let)) { + vars_ <- c(vars_, paste("P012", eth.let[e], "0", num, sep = "")) + } + } + + if (age == T & sex == T) { + eth.let <- c("I", "B", "H", "D", "E", "F", "C") + num <- as.character(c(c("01", "03", "04", "05", "06", "07", "08", "09"), seq(10, 25), seq(27, 49))) + for (e in 1:length(eth.let)) { + vars_ <- c(vars_, paste("P012", eth.let[e], "0", num, sep = "")) + } + } + + # set the census data url links + if (as.character(year) != "2020") { + census_data_url = "https://api.census.gov/data/2010/dec/sf1?" + } + else { + census_data_url = "https://api.census.gov/data/2020/dec/pl?" 
+ } if (geo == "place") { + geo.merge <- c("state", "place") region <- paste("for=place:*&in=state:", state.fips, sep = "") - census <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region, retry) + census <- get_census_api(census_data_url, key = key, var.names = vars_, region = region, retry) } if (geo == "county") { + geo.merge <- c("state", "county") + if (is.null(counties)) { region <- paste("for=county:*&in=state:", state.fips, sep = "") } else { @@ -88,10 +144,13 @@ census_geo_api <- function( region <- paste("for=county:",counties_paste,"&in=state:", state.fips, sep = "") } - census <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region, retry) + census <- get_census_api(census_data_url, key = key, var.names = vars_, region = region, retry) } if (geo == "tract") { + + geo.merge <- c("state", "county", "tract") + if (is.null(counties)) { region_county <- paste("for=county:*&in=state:", state.fips, sep = "") } else { @@ -99,7 +158,7 @@ census_geo_api <- function( region_county <- paste("for=county:",counties_paste,"&in=state:", state.fips, sep = "") } - county_df <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region_county, retry) + county_df <- get_census_api(census_data_url, key = key, var.names = vars_, region = region_county, retry) if(is.null(counties)) { county_list <- county_df$county @@ -108,17 +167,22 @@ census_geo_api <- function( } if(length(county_list) > 0) { - census <- furrr::future_map_dfr(seq_along(county_list), function(county) { + census_tracts <- furrr::future_map_dfr(seq_along(county_list), function(county) { message(paste("County ", county, " of ", length(county_list), ": ", county_list[county], sep = "")) region_county <- paste("for=tract:*&in=state:", state.fips, "+county:", county_list[county], sep = "") - get_census_api(data_url = census_data_url, key = key, var.names = unlist(vars), region = region_county, retry) - }, .progress = TRUE) 
+ get_census_api(data_url = census_data_url, key = key, var.names = vars_, region = region_county, retry) + }) + + census <- rbind(census, census_tracts) + rm(census_tracts) } else { message('There were no intersecting counties in your voter.file data (tract)') } } if (geo == "block_group") { + geo.merge <- c("state", "county", "tract", "block_group") + if (is.null(counties)) { region_county <- paste("for=county:*&in=state:", state.fips, sep = "") } else { @@ -126,7 +190,7 @@ census_geo_api <- function( region_county <- paste("for=county:",counties_paste,"&in=state:", state.fips, sep = "") } - county_df <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region_county, retry) + county_df <- get_census_api(census_data_url, key = key, var.names = vars_, region = region_county, retry) if(is.null(counties)) { county_list <- county_df$county @@ -137,7 +201,7 @@ census_geo_api <- function( if(length(county_list) > 0) { message('Running block_group by county...') - census <- purrr::map_dfr( + census_blockgroup <- purrr::map_dfr( 1:length(county_list), function(county) { # too verbose, commenting out @@ -146,18 +210,24 @@ census_geo_api <- function( blockgroup <- paste("for=block+group:*&in=state:", state.fips, "+county:", county_list[county], sep = "") # message(region_tract) - blockgroup_df <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = blockgroup, retry) + blockgroup_df <- get_census_api(census_data_url, key = key, var.names = vars_, region = blockgroup, retry) names(blockgroup_df)[4] <- "block_group" # Fix name, it comes in with a space from api. 
blockgroup_df } ) message("\n") # new line for progress bar + + census <- rbind(census, census_blockgroup) + rm(census_blockgroup) } else { message('There were no intersecting counties in your voter.file data (block)') } } if (geo == "block") { + + geo.merge <- c("state", "county", "tract", "block") + if (is.null(counties)) { region_county <- paste("for=county:*&in=state:", state.fips, sep = "") } else { @@ -165,7 +235,7 @@ census_geo_api <- function( region_county <- paste("for=county:",counties_paste,"&in=state:", state.fips, sep = "") } - county_df <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region_county, retry) + county_df <- get_census_api(census_data_url, key = key, var.names = vars_, region = region_county, retry) if(is.null(counties)) { county_list <- county_df$county @@ -176,7 +246,7 @@ census_geo_api <- function( if(length(county_list) > 0) { message('Running block by county...') - census <- purrr::map_dfr( + census_blocks <- purrr::map_dfr( 1:length(county_list), function(county) { # too verbose, commenting out @@ -184,55 +254,117 @@ census_geo_api <- function( region_tract <- paste("for=tract:*&in=state:", state.fips, "+county:", county_list[county], sep = "") # message(region_tract) - tract_df <- get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region_tract, retry) + tract_df <- get_census_api(census_data_url, key = key, var.names = vars_, region = region_tract, retry) tract_list <- tract_df$tract furrr::future_map_dfr(1:length(tract_list), function(tract) { message(paste("Tract ", tract, " of ", length(tract_list), ": ", tract_list[tract], sep = "")) region_block <- paste("for=block:*&in=state:", state.fips, "+county:", county_list[county], "+tract:", tract_list[tract], sep = "") - get_census_api(census_data_url, key = key, var.names = unlist(vars), region = region_block, retry) - }, .progress = TRUE) + get_census_api(census_data_url, key = key, var.names = vars_, region = 
region_block, retry) + }) } ) message("\n") # new line for progress bar + + census <- rbind(census, census_blocks) + rm(census_blocks) } else { message('There were no intersecting counties in your voter.file data (block)') } } - if (geo == "zcta") { - census <- census_geo_api_zcta( - census_data_url = census_data_url, - key = key, - vars = vars, - state = state, - counties = counties, - retry = retry - ) - } + census$state <- state - census <- dplyr::mutate(census, state = as_state_abbreviation(state)) + if (age == F & sex == F) { + + ## Calculate Pr(Geolocation | Race) + census$r_whi <- census[, vars_["pop_white"]] / sum(census[, vars_["pop_white"]]) #Pr(Geo|White) + census$r_bla <- census[, vars_["pop_black"]] / sum(census[, vars_["pop_black"]]) #Pr(Geo|Black) + census$r_his <- census[, vars_["pop_hisp"]] / sum(census[, vars_["pop_hisp"]]) #Pr(Geo|Latino) + census$r_asi <- (census[, vars_["pop_asian"]] + census[, vars_["pop_nhpi"]]) / (sum(census[, vars_["pop_asian"]]) + sum(census[, vars_["pop_nhpi"]])) #Pr(Geo | Asian or NH/PI) + census$r_oth <- (census[, vars_["pop_aian"]] + census[, vars_["pop_other"]] + census[, vars_["pop_two"]]) / (sum(census[, vars_["pop_aian"]]) + sum(census[, vars_["pop_other"]]) + sum(census[, vars_["pop_two"]])) #Pr(Geo | AI/AN, Other, or Mixed) + + } - r_columns <- purrr::map(vars, function(vars) rowSums(census[vars])) + if (age == F & sex == T) { + + ## Calculate Pr(Geolocation, Sex | Race) + eth.cen <- c("whi", "bla", "his", "asi", "oth") + eth.let <- c("I", "B", "H", "D", "F") + + for (i in 1:length(eth.cen)) { + if (i != 4 & i != 5) { + census[paste("r_mal", eth.cen[i], sep = "_")] <- census[paste("P012", eth.let[i], "002", sep = "")] / sum(census[paste("P012", eth.let[i], "001", sep = "")]) + census[paste("r_fem", eth.cen[i], sep = "_")] <- census[paste("P012", eth.let[i], "026", sep = "")] / sum(census[paste("P012", eth.let[i], "001", sep = "")]) + } + if (i == 4) { + ## Combine Asian and Native Hawaiian/Pacific Islander + 
census[paste("r_mal", eth.cen[i], sep = "_")] <- (census$P012D002 + census$P012E002) / sum(census$P012D001 + census$P012E001) + census[paste("r_fem", eth.cen[i], sep = "_")] <- (census$P012D026 + census$P012E026) / sum(census$P012D001 + census$P012E001) + } + if (i == 5) { + ## Combine American India/Alaska Native and Other + census[paste("r_mal", eth.cen[i], sep = "_")] <- (census$P012C002 + census$P012F002) / sum(census$P012C001 + census$P012F001) + census[paste("r_fem", eth.cen[i], sep = "_")] <- (census$P012C026 + census$P012F026) / sum(census$P012C001 + census$P012F001) + } + } + } - census <- dplyr::bind_cols(census, r_columns) - census <- dplyr::group_by(census, dplyr::across(dplyr::any_of("state"))) - census <- dplyr::mutate( - census, - dplyr::across( - # Divide all r_columns by the total population of the corresponding race - dplyr::all_of(names(r_columns)), - function(x) { - x / sum( - dplyr::pick( - sub("^.+_(.{3})$", "r_\\1", dplyr::cur_column(), perl = TRUE) - ) - ) + if (age == T & sex == F) { + + ## Calculate Pr(Geolocation, Age Category | Race) + eth.cen <- c("whi", "bla", "his", "asi", "oth") + eth.let <- c("I", "B", "H", "D", "F") + age.cat <- c(seq(1, 23), seq(1, 23)) + age.cen <- as.character(c(c("03", "04", "05", "06", "07", "08", "09"), seq(10, 25), seq(27, 49))) + + for (i in 1:length(eth.cen)) { + for (j in 1:23) { + if (i != 4 & i != 5) { + census[paste("r", age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012", eth.let[i], "0", age.cen[j], sep = "")] + census[paste("P012", eth.let[i], "0", age.cen[j + 23], sep = "")]) / sum(census[paste("P012", eth.let[i], "001", sep = "")]) + } + if (i == 4) { + ## Combine Asian and Native Hawaiian/Pacific Islander + census[paste("r", age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012D0", age.cen[j], sep = "")] + census[paste("P012D0", age.cen[j + 23], sep = "")] + census[paste("P012E0", age.cen[j], sep = "")] + census[paste("P012E0", age.cen[j + 23], sep = "")]) / sum(census$P012D001 + 
census$P012E001) + } + if (i == 5) { + ## Combine American India/Alaska Native and Other + census[paste("r", age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012C0", age.cen[j], sep = "")] + census[paste("P012C0", age.cen[j + 23], sep = "")] + census[paste("P012F0", age.cen[j], sep = "")] + census[paste("P012F0", age.cen[j + 23], sep = "")]) / sum(census$P012C001 + census$P012F001) + } } - ) - ) - census <- dplyr::ungroup(census) + } + } - census + if (age == T & sex == T) { + + ## Calculate Pr(Geolocation, Sex, Age Category | Race) + eth.cen <- c("whi", "bla", "his", "asi", "oth") + eth.let <- c("I", "B", "H", "D", "F") + sex.let <- c("mal", "fem") + age.cat <- c(seq(1, 23), seq(1, 23)) + age.cen <- as.character(c(c("03", "04", "05", "06", "07", "08", "09"), seq(10, 25), seq(27, 49))) + + for (i in 1:length(eth.cen)) { + for (k in 1:length(sex.let)) { + for (j in 1:23) { + if (k == 2) { + j <- j + 23 + } + if (i != 4 & i != 5) { + census[paste("r", sex.let[k], age.cat[j], eth.cen[i], sep = "_")] <- census[paste("P012", eth.let[i], "0", age.cen[j], sep = "")] / sum(census[paste("P012", eth.let[i], "001", sep = "")]) + } + if (i == 4) { + ## Combine Asian and Native Hawaiian/Pacific Islander + census[paste("r", sex.let[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012D0", age.cen[j], sep = "")] + census[paste("P012E0", age.cen[j], sep = "")]) / sum(census$P012D001 + census$P012E001) + } + if (i == 5) { + ## Combine American India/Alaska Native and Other + census[paste("r", sex.let[k], age.cat[j], eth.cen[i], sep = "_")] <- (census[paste("P012C0", age.cen[j], sep = "")] + census[paste("P012F0", age.cen[j], sep = "")]) / sum(census$P012C001 + census$P012F001) + } + } + } + } + } + return(census) } diff --git a/R/census_geo_api_names.R b/R/census_geo_api_names.R deleted file mode 100644 index 0abc53e..0000000 --- a/R/census_geo_api_names.R +++ /dev/null @@ -1,181 +0,0 @@ -# @staticimports pkg:stringstatic -# str_pad - -#' Census geo API helper 
functions -#' -#' @inheritParams census_geo_api -#' -#' @return -#' \describe{ -#' \item{`census_geo_api_names()`}{ -#' A named list of [character] vectors whose values correspond to columns -#' of a Census API table and whose names represent the new columns they are -#' used to calculate in [census_geo_api()]. -#' } -#' \item{`census_geo_api_url()`}{ -#' A [character] string containing the base of the URL to a -#' Census API table. -#' } -#' } -#' @keywords internal -census_geo_api_names <- function( - year = c("2020", "2010", "2000"), - age = FALSE, - sex = FALSE -) { - year <- as.character(year) - year <- rlang::arg_match(year) - - assert_boolean(age) - assert_boolean(sex) - - if (year == "2020") { - prefix <- "P12" - separator <- "_" - suffix <- "N" - sex_codes <- c("_mal" = 2, "_fem" = 26) - age_codes <- 1:23 - names(age_codes) <- paste0("_", age_codes) - } else if (year %in% c("2010", "2000")) { - prefix <- "PCT012" - separator <- "" - suffix <- "" - sex_codes <- c("_mal" = 2, "_fem" = 106) - age_codes <- list( - "_1" = 1:5, - "_2" = 6:10, - "_3" = 11:15, - "_4" = 16:18, - "_5" = 19:20, - "_6" = 21, - "_7" = 22, - "_8" = 23:25, - "_9" = 26:30, - "_10" = 31:35, - "_11" = 36:40, - "_12" = 41:45, - "_13" = 46:50, - "_14" = 51:55, - "_15" = 56:60, - "_16" = 61:62, - "_17" = 63:65, - "_18" = 66:67, - "_19" = 68:70, - "_20" = 71:75, - "_21" = 76:80, - "_22" = 81:85, - "_23" = 86:103 - ) - } - - race_codes <- list( - "_whi" = "I", - "_bla" = "J", - "_his" = "H", - "_asi" = c("L", "M"), - "_oth" = c("K", "N", "O") - ) - - numeric_codes <- if (age && sex) { - age_sex_codes <- purrr::imap( - sex_codes, - function(sex_code, name) { - codes <- purrr::map( - age_codes, - function(age_code) { - str_pad(age_code + sex_code, 3, "left", pad = "0") - } - ) - names(codes) <- paste0(name, names(codes)) - codes - } - ) - - do.call(c, unname(age_sex_codes)) - } else if (age) { - purrr::map( - age_codes, - function(age_code) { - unlist( - purrr::map( - sex_codes, - 
function(sex_code) { - str_pad(age_code + sex_code, 3, "left", pad = "0") - } - ) - ) - } - ) - } else if (sex) { - sex_codes[] <- str_pad(sex_codes, 3, "left", pad = "0") - as.list(sex_codes) - } - - numeric_codes <- c("001", numeric_codes) - - combinations <- expand.grid( - prefix = prefix, - race_codes = race_codes, - separator = separator, - numeric_codes = numeric_codes, - suffix = suffix, - KEEP.OUT.ATTRS = FALSE, - stringsAsFactors = FALSE - ) - - vars <- purrr::pmap( - combinations, - function(prefix, race_codes, separator, numeric_codes, suffix) { - inner_combinations <- expand.grid( - prefix = prefix, - race_codes = race_codes, - separator = separator, - numeric_codes = numeric_codes, - suffix = suffix, - KEEP.OUT.ATTRS = FALSE, - stringsAsFactors = FALSE - ) - - apply(inner_combinations, 1, paste, collapse = "") - } - ) - - names(vars) <- paste0( - "r", - names(combinations$numeric_codes), - names(combinations$race_codes) - ) - - vars -} - -census_geo_api_names_legacy <- function(year) { - if (year == 2020) { - return( - list( - r_whi = 'P2_005N', - r_bla = 'P2_006N', - r_his = 'P2_002N', - r_asi = c('P2_008N', 'P2_009N'), - r_oth = c('P2_007N', 'P2_010N', 'P2_011N') - ) - ) - } - - list( - r_whi = 'P005003', - r_bla = 'P005004', - r_his = 'P005010', - r_asi = c('P005006', 'P005007'), - r_oth = c('P005005', 'P005008', 'P005009') - ) -} - -#' @rdname census_geo_api_names -census_geo_api_url <- function(year = c("2020", "2010", "2000")) { - year <- as.character(year) - year <- rlang::arg_match(year) - - if (year == "2020") return("https://api.census.gov/data/2020/dec/dhc?") - paste0("https://api.census.gov/data/", year, "/dec/sf1?") -} \ No newline at end of file diff --git a/R/census_geo_api_zcta.R b/R/census_geo_api_zcta.R deleted file mode 100644 index a82dce1..0000000 --- a/R/census_geo_api_zcta.R +++ /dev/null @@ -1,32 +0,0 @@ -census_geo_api_zcta <- function( - census_data_url, - key, - vars, - state, - counties, - retry -) { - if 
(!is.null(counties)) { - cli::cli_abort( - '{.arg counties} must be {.code NULL} when {.code geo = "zcta"}, - because ZCTA-level census data split by county is not available.' - ) - } - - region <- paste0( - "for=zip%20code%20tabulation%20area%20(or%20part):*&in=state:", - paste(as_fips_code(state), collapse = ",") - ) - - census <- get_census_api( - census_data_url, - key = key, - var.names = unlist(vars), - region = region, - retry - ) - - names(census)[[2]] <- "zcta" - - census -} diff --git a/R/census_helper.R b/R/census_helper.R index 9068663..c1aee34 100644 --- a/R/census_helper.R +++ b/R/census_helper.R @@ -7,7 +7,8 @@ #' at the county, tract, block, or place level. Census data calculated are #' Pr(Geolocation | Race) where geolocation is county, tract, block, or place. #' -#' @inheritParams get_census_data +#' @param key A required character object. Must contain user's Census API +#' key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}. #' @param voter.file An object of class \code{data.frame}. Must contain field(s) named #' \code{\var{county}}, \code{\var{tract}}, \code{\var{block}}, and/or \code{\var{place}} #' specifying geolocation. 
These should be character variables that match up with @@ -53,35 +54,24 @@ #' data(voters) #' } #' \dontrun{ -#' census_helper(voter.file = voters, states = "nj", geo = "block") +#' census_helper(key = "...", voter.file = voters, states = "nj", geo = "block") #' } #' \dontrun{ #' census_helper( -#' voter.file = voters, states = "all", geo = "tract", +#' key = "...", voter.file = voters, states = "all", geo = "tract", #' age = TRUE, sex = TRUE #' ) #' } #' \dontrun{ #' census_helper( -#' voter.file = voters, states = "all", geo = "county", +#' key = "...", voter.file = voters, states = "all", geo = "county", #' age = FALSE, sex = FALSE, year = "2020" #' ) #' } #' #' @keywords internal -census_helper <- function( - key = Sys.getenv("CENSUS_API_KEY"), - voter.file, - states = "all", - geo = "tract", - age = FALSE, - sex = FALSE, - year = "2020", - census.data = NULL, - retry = 3, - use.counties = FALSE -) { +census_helper <- function(key, voter.file, states = "all", geo = "tract", age = FALSE, sex = FALSE, year = "2020", census.data = NULL, retry = 3, use.counties = FALSE) { if (is.null(census.data) || (typeof(census.data) != "list")) { toDownload <- TRUE } else { @@ -89,7 +79,9 @@ census_helper <- function( } if (toDownload) { - key <- validate_key(key) + if (missing(key)) { + stop("Must enter U.S. Census API key, which can be requested at https://api.census.gov/data/key_signup.html.") + } } states <- toupper(states) diff --git a/R/census_helper_v2.R b/R/census_helper_v2.R index fad323b..6761443 100644 --- a/R/census_helper_v2.R +++ b/R/census_helper_v2.R @@ -8,7 +8,8 @@ #' at the county, tract, block, or place level. Census data calculated are #' Pr(Geolocation | Race) where geolocation is county, tract, block, or place. #' -#' @inheritParams get_census_data +#' @param key A required character object. Must contain user's Census API +#' key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}. 
#' @param voter.file An object of class \code{data.frame}. Must contain field(s) named #' \code{\var{county}}, \code{\var{tract}}, \code{\var{block}}, and/or \code{\var{place}} #' specifying geolocation. These should be character variables that match up with @@ -42,46 +43,31 @@ #' @param retry The number of retries at the census website if network interruption occurs. #' @param use.counties A logical, defaulting to FALSE. Should census data be filtered by counties #' available in \var{census.data}? -#' @param skip_bad_geos Logical. Option to have the function skip any geolocations that are not present -#' in the census data, returning a partial data set. Default is set to \code{FALSE}, which case it will -#' break and provide error message with a list of offending geolocations. #' @return Output will be an object of class \code{data.frame}. It will #' consist of the original user-input data with additional columns of #' Census data. #' #' @examples #' \dontshow{data(voters)} -#' \dontrun{census_helper_new(voter.file = voters, states = "nj", geo = "block")} -#' \dontrun{census_helper_new(voter.file = voters, states = "all", geo = "tract")} -#' \dontrun{census_helper_new(voter.file = voters, states = "all", geo = "place", +#' \dontrun{census_helper_new(key = "...", voter.file = voters, states = "nj", geo = "block")} +#' \dontrun{census_helper_new(key = "...", voter.file = voters, states = "all", geo = "tract")} +#' \dontrun{census_helper_new(key = "...", voter.file = voters, states = "all", geo = "place", #' year = "2020")} #' #' @keywords internal -census_helper_new <- function( - key = Sys.getenv("CENSUS_API_KEY"), - voter.file, - states = "all", - geo = c("tract", "block", "block_group", "county", "place", "zcta"), - age = FALSE, - sex = FALSE, - year = "2020", - census.data = NULL, - retry = 3, - use.counties = FALSE, - skip_bad_geos = FALSE -) { +census_helper_new <- function(key, voter.file, states = "all", geo = "tract", age = FALSE, sex = FALSE, year = 
"2020", census.data = NULL, retry = 3, use.counties = FALSE) { - if ("precinct" %in% geo) { + if (geo == "precinct") { stop("Error: census_helper_new function does not currently support precinct-level data.") } - geo <- tolower(geo) - geo <- rlang::arg_match(geo) - if(!(year %in% c("2000","2010","2020"))){ stop("Interface only implemented for census years '2000', '2010', or '2020'.") } + if (any(age, sex)){ + stop("Models using age and sex not currently implemented.") + } if (is.null(census.data) || (typeof(census.data) != "list")) { toDownload = TRUE @@ -90,13 +76,15 @@ census_helper_new <- function( } if (toDownload) { - key <- validate_key(key) + if (missing(key)) { + stop('Must enter U.S. Census API key, which can be requested at https://api.census.gov/data/key_signup.html.') + } } - if (toupper(states) == "ALL") { + states <- toupper(states) + if (states == "ALL") { states <- toupper(as.character(unique(voter.file$state))) } - states <- as_state_abbreviation(states) df.out <- NULL @@ -105,6 +93,30 @@ census_helper_new <- function( message(paste("State ", s, " of ", length(states), ": ", states[s], sep = "")) state <- toupper(states[s]) + if (geo == "place") { + geo.merge <- c("place") + if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$year != year) || (census.data[[state]]$age != FALSE) || (census.data[[state]]$sex != FALSE)) { + #} || (census.data[[state]]$age != age) || (census.data[[state]]$sex != sex)) { + if(use.counties) { + census <- census_geo_api(key, state, geo = "place", age, sex, retry) + } else { + census <- census_geo_api(key, state, geo = "place", age, sex, retry) + } + } else { + census <- census.data[[toupper(state)]]$place + } + } + + if (geo == "county") { + geo.merge <- c("county") + if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$year != year) || (census.data[[state]]$age != FALSE) || (census.data[[state]]$sex != FALSE)) { + #} || (census.data[[state]]$age != age) || 
(census.data[[state]]$sex != sex)) { + census <- census_geo_api(key, state, geo = "county", age, sex, retry) + } else { + census <- census.data[[toupper(state)]]$county + } + } + if (geo == "tract") { geo.merge <- c("county", "tract") if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$year != year) || (census.data[[state]]$age != FALSE) || (census.data[[state]]$sex != FALSE)) {#} || (census.data[[state]]$age != age) || (census.data[[state]]$sex != sex)) { @@ -113,12 +125,14 @@ census_helper_new <- function( # Only those counties within the target state counties = unique(voter.file$county[voter.file$state == state])) } else { - census <- census_geo_api(key, state, geo = "tract", age, sex, year, retry) + census <- census_geo_api(key, state, geo = "tract", age, sex, retry) } } else { census <- census.data[[toupper(state)]]$tract } - } else if (geo == "block_group") { + } + + if (geo == "block_group") { geo.merge <- c("county", "tract", "block_group") if ((toDownload) || (is.null(census.data[[state]])) || (census.data[[state]]$year != year) || (census.data[[state]]$age != FALSE) || (census.data[[state]]$sex != FALSE)) {#} || (census.data[[state]]$age != age) || (census.data[[state]]$sex != sex)) { if(use.counties) { @@ -126,13 +140,16 @@ census_helper_new <- function( # Only those counties within the target state counties = unique(voter.file$county[voter.file$state == state])) } else { - census <- census_geo_api(key, state, geo = "block_group", age, sex, year, retry) + census <- census_geo_api(key, state, geo = "block_group", age, sex, retry) } } else { census <- census.data[[toupper(state)]]$block_group } - } else if (geo == "block") { + } + + + if (geo == "block") { if(any(names(census.data) == "block_group")) { geo.merge <- c("county", "tract", "block_group", "block") } else { @@ -145,57 +162,49 @@ census_helper_new <- function( # Only those counties within the target state counties = unique(voter.file$county[voter.file$state == state])) } 
else { - census <- census_geo_api(key, state, geo = "block", age, sex, year, retry) + census <- census_geo_api(key, state, geo = "block", age, sex, retry) } } else { census <- census.data[[toupper(state)]]$block } - } else { - geo.merge <- geo - - state_must_be_downloaded <- toDownload || - is.null(census.data[[state]]) || - census.data[[state]]$year != year || - # TODO: Why do we always redownload if sex or age == TRUE? - census.data[[state]]$age != FALSE || - census.data[[state]]$sex != FALSE - - if (state_must_be_downloaded) { - census <- census_geo_api(key, state, geo = geo, age, sex, year, retry) - } else { - census <- census.data[[state]][[geo]] - } } census$state <- state + ## Calculate Pr(Geolocation | Race) - if (any(c("P2_005N", "P005003") %in% names(census))) { - message(sprintf("NOTE: Legacy column names detected, loading Race values from Census Redistricting table for %s. Age, Sex, and ZCTA predictions will be unavailable.", year)) - # TODO: Add test that we get the same ratios with legacy and new tables for 2020 - # Old table: Redistricting (Pl-some numbers) (does not have age, sex, or ZCTAs) - # New table: DHC (does have age, sex, and ZCTA) - vars_ <- census_geo_api_names_legacy(year = year) + if (year != "2020") { + vars_ <- c( + pop_white = 'P005003', pop_black = 'P005004', + pop_aian = 'P005005', pop_asian = 'P005006', + pop_nhpi = 'P005007', pop_other = 'P005008', + pop_two = 'P005009', pop_hisp = 'P005010' + ) + drop <- c(grep("state", names(census)), grep("P005", names(census))) } else { - vars_ <- census_geo_api_names(year) + vars_ <- c( + pop_white = 'P2_005N', pop_black = 'P2_006N', + pop_aian = 'P2_007N', pop_asian = 'P2_008N', + pop_nhpi = 'P2_009N', pop_other = 'P2_010N', + pop_two = 'P2_011N', pop_hisp = 'P2_002N' + ) + drop <- c(grep("state", names(census)), grep("P2_", names(census))) } - drop <- match(c("state", unlist(vars_)), names(census)) + geoPopulations <- rowSums(census[,names(census) %in% vars_]) - geoPopulations <- 
rowSums(census[,names(census) %in% unlist(vars_)]) - - census$r_whi <- rowSums(census[, vars_[["r_whi"]], drop = FALSE]) / (geoPopulations) #Pr(White | Geo) - census$r_bla <- rowSums(census[, vars_[["r_bla"]], drop = FALSE]) / (geoPopulations) #Pr(Black | Geo) - census$r_his <- rowSums(census[, vars_[["r_his"]], drop = FALSE]) / (geoPopulations) #Pr(Latino | Geo) - census$r_asi <- rowSums(census[, vars_[["r_asi"]], drop = FALSE]) / (geoPopulations) #Pr(Asian or NH/PI | Geo) - census$r_oth <- rowSums(census[, vars_[["r_oth"]], drop = FALSE]) / (geoPopulations) #Pr(AI/AN, Other, or Mixed | Geo) + census$r_whi <- (census[, vars_["pop_white"]]) / (geoPopulations ) #Pr(White | Geo) + census$r_bla <- (census[, vars_["pop_black"]]) / (geoPopulations) #Pr(Black | Geo) + census$r_his <- (census[, vars_["pop_hisp"]]) / (geoPopulations) #Pr(Latino | Geo) + census$r_asi <- (census[, vars_["pop_asian"]] + census[, vars_["pop_nhpi"]]) / (geoPopulations) #Pr(Asian or NH/PI | Geo) + census$r_oth <- (census[, vars_["pop_aian"]] + census[, vars_["pop_other"]] + census[, vars_["pop_two"]]) / (geoPopulations) #Pr(AI/AN, Other, or Mixed | Geo) # check locations with zero people # get average without places with zero people, and assign that to zero locs. 
- zero_ind <- which((geoPopulations - 0.0) < .Machine$double.eps) - if (length(zero_ind)) { - for (rcat in c("r_whi","r_bla","r_his","r_asi","r_oth")) { - census[[rcat]][zero_ind] <- mean(census[[rcat]], na.rm = TRUE) + if(any((geoPopulations - 0.0) < .Machine$double.eps)){ + zero_ind <- which((geoPopulations - 0.0) < .Machine$double.eps) + for(rcat in c("r_whi","r_bla","r_his","r_asi","r_oth") ){ + census[[rcat]][zero_ind] <- mean(census[[rcat]], na.rm=TRUE) } } @@ -206,18 +215,11 @@ census_helper_new <- function( #Check if geolocation missing from census object if(any(is.na(voters.census$r_whi))){ miss_ind <- which(is.na(voters.census$r_whi)) - message("The following locations in the voter.file are not available in the census data.", + stop("The following locations in the voter.file are not available in the census data ", paste0("(listed as ", paste0(c("state",geo.merge), collapse="-"),"):\n"), paste(do.call(paste, c(unique(voters.census[miss_ind, c("state",geo.merge)]), sep="-")), collapse = ", ")) - if(skip_bad_geos == TRUE) { - message("NOTE: Skipping unavailable geolocations. Returning partial data set.") - voters.census <- voters.census[!is.na(voters.census$r_whi),] - } - else { - stop("Stopping predictions. Please revise census data and/or verify the correct year is being supplied. To skip these rows use 'skip_bad_geos = TRUE'") - } } # } diff --git a/R/get_census_api.R b/R/get_census_api.R index e0abc83..97d1e81 100644 --- a/R/get_census_api.R +++ b/R/get_census_api.R @@ -5,9 +5,10 @@ #' This function obtains U.S. Census data via the public API. User #' can specify the variables and region(s) for which to obtain data. #' -#' @inheritParams get_census_data #' @param data_url URL root of the API, #' e.g., \code{"https://api.census.gov/data/2020/dec/pl"}. +#' @param key A required character object containing user's Census API key, +#' which can be requested \href{https://api.census.gov/data/key_signup.html}{here}. 
#' @param var.names A character vector of variables to get, #' e.g., \code{c("P2_005N", "P2_006N", "P2_007N", "P2_008N")}. #' If there are more than 50 variables, then function will automatically @@ -22,7 +23,7 @@ #' @examples #' \dontrun{ #' get_census_api( -#' data_url = "https://api.census.gov/data/2020/dec/pl", +#' data_url = "https://api.census.gov/data/2020/dec/pl", key = "...", #' var.names = c("P2_005N", "P2_006N", "P2_007N", "P2_008N"), region = "for=county:*&in=state:34" #' ) #' } @@ -32,13 +33,7 @@ #' \href{https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. #' #' @keywords internal -get_census_api <- function( - data_url, - key = Sys.getenv("CENSUS_API_KEY"), - var.names, - region, - retry = 0 -) { +get_census_api <- function(data_url, key, var.names, region, retry = 0) { if (length(var.names) > 50) { var.names <- vec_to_chunk(var.names) # Split variables into a list get <- lapply(var.names, function(x) paste(x, sep = "", collapse = ",")) diff --git a/R/get_census_api_2.R b/R/get_census_api_2.R index 4b8cccf..e0a9854 100644 --- a/R/get_census_api_2.R +++ b/R/get_census_api_2.R @@ -6,9 +6,10 @@ #' It is used by the \code{get_census_api} function. The user should not need to call this #' function directly. #' -#' @inheritParams get_census_data #' @param data_url URL root of the API, #' e.g., \code{"https://api.census.gov/data/2020/dec/pl"}. +#' @param key A required character object containing user's Census API key, +#' which can be requested \href{https://api.census.gov/data/key_signup.html}{here}. #' @param get A character vector of variables to get, #' e.g., \code{c("P2_005N", "P2_006N", "P2_007N", "P2_008N")}. #' If there are more than 50 variables, then function will automatically @@ -21,7 +22,7 @@ #' If unsuccessful, function prints the URL query that was constructed. 
#' #' @examples -#' \dontrun{try(get_census_api_2(data_url = "https://api.census.gov/data/2020/dec/pl", +#' \dontrun{try(get_census_api_2(data_url = "https://api.census.gov/data/2020/dec/pl", key = "...", #' get = c("P2_005N", "P2_006N", "P2_007N", "P2_008N"), region = "for=county:*&in=state:34"))} #' #' @references @@ -29,13 +30,7 @@ #' \href{https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. #' #' @keywords internal -get_census_api_2 <- function( - data_url, - key = Sys.getenv("CENSUS_API_KEY"), - get, - region, - retry = 3 -){ +get_census_api_2 <- function(data_url, key, get, region, retry = 3){ if(length(get) > 1) { get <- paste(get, collapse=',', sep='') } diff --git a/R/get_census_data.R b/R/get_census_data.R index 9eb735d..dd86516 100644 --- a/R/get_census_data.R +++ b/R/get_census_data.R @@ -4,13 +4,8 @@ #' for specified state(s). Using this function to download Census data in advance #' can save considerable time when running \code{predict_race} and \code{census_helper}. #' -#' @param key A character string containing a valid Census API key, -#' which can be requested from the -#' [U.S. Census API key signup page](https://api.census.gov/data/key_signup.html). -#' -#' By default, attempts to find a census key stored in an -#' [environment variable][Sys.getenv] named `CENSUS_API_KEY`. -#' +#' @param key A required character object containing a valid Census API key, +#' which can be requested \href{https://api.census.gov/data/key_signup.html}{here}. #' @param states which states to extract Census data for, e.g., \code{c("NJ", "NY")}. #' @param age A \code{TRUE}/\code{FALSE} object indicating whether to condition on #' age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). 
@@ -36,23 +31,19 @@ #' #' @export #' -#' @examples -#' \dontrun{get_census_data(states = c("NJ", "NY"), age = TRUE, sex = FALSE)} -#' \dontrun{get_census_data(states = "MN", age = FALSE, sex = FALSE, year = "2020")} -get_census_data <- function( - key = Sys.getenv("CENSUS_API_KEY"), - states, - age = FALSE, - sex = FALSE, - year = "2020", - census.geo = c("tract", "block", "block_group", "county", "place", "zcta"), - retry = 3, - county.list = NULL -) { - key <- validate_key(key) +#' @examples +#' \dontrun{get_census_data(key = "...", states = c("NJ", "NY"), age = TRUE, sex = FALSE)} +#' \dontrun{get_census_data(key = "...", states = "MN", age = FALSE, sex = FALSE, year = "2020")} +get_census_data <- function(key = NULL, states, age = FALSE, sex = FALSE, year = "2020", census.geo = "block", retry = 3, county.list = NULL) { - census.geo <- tolower(census.geo) - census.geo <- rlang::arg_match(census.geo) + if (is.null(key)) { + # Matches tidycensus name for env var + key <- Sys.getenv("CENSUS_API_KEY") + } + + if (missing(key) | key == "") { + stop('Must enter valid Census API key, which can be requested at https://api.census.gov/data/key_signup.html.') + } states <- toupper(states) @@ -84,18 +75,6 @@ get_census_data <- function( county <- census_geo_api(key, s, geo = "county", age, sex, year, retry) CensusObj[[s]]$county <- county } - - if (census.geo == "zcta") { - if (!is.null(county.list)) { - cli::cli_abort(c( - "The {.arg county.list} argument must be set to {.code NULL} - when {.arg census_geo} is {.val zcta}, - because the Census Bureau does release data that divides ZCTAs by county." 
- )) - } - - CensusObj[[s]]$zcta <- census_geo_api(key, s, geo = "zcta", age, sex, year, retry) - } } return(CensusObj) } \ No newline at end of file diff --git a/R/merge_names.R b/R/merge_names.R index c1e3d8e..74e417c 100644 --- a/R/merge_names.R +++ b/R/merge_names.R @@ -1,102 +1,141 @@ +# ------------------------------ +# Helper: resolve cache location +# ------------------------------ +wru_cache_dir <- function() { + opt <- getOption("wru_data_wd", default = FALSE) + if (isTRUE(opt)) { + # TRUE means: use current working directory + return(getwd()) + } + if (is.character(opt) && length(opt) == 1 && !is.na(opt) && nzchar(opt)) { + dir.create(path.expand(opt), recursive = TRUE, showWarnings = FALSE) + return(path.expand(opt)) + } + # default: tempdir for this session + tempdir() +} + +#' Preflight for name data (persistent, pinned, and verified) +#' Downloads 4 name dictionaries to a persistent directory if missing. +#' Set options(wru_data_wd = TRUE) to cache in getwd(); otherwise uses tempdir(). +wru_data_preflight <- function() { + dest <- ifelse(getOption("wru_data_wd", default = FALSE), getwd(), tempdir()) + files_needed <- c( + "wru-data-first_c.rds", + "wru-data-mid_c.rds", + "wru-data-census_last_c.rds", + "wru-data-last_c.rds" + ) + missing <- files_needed[!file.exists(file.path(dest, files_needed))] + if (length(missing) == 0L) return(invisible(TRUE)) + + # Use the release that actually has these assets (matches CRAN code). + tag_to_use <- "v2.0.0" + + # Try download; piggyback ignores .token when "" (same as CRAN code). + tryCatch( + { + piggyback::pb_download( + file = missing, + repo = "kosukeimai/wru", + tag = tag_to_use, + dest = dest, + overwrite = TRUE, + .token = "" + ) + }, + error = function(e) { + stop( + sprintf( + "wru_data_preflight() failed to fetch: %s\nReason: %s", + paste(missing, collapse = ", "), + conditionMessage(e) + ), + call. = FALSE + ) + } + ) + + # Verify they arrived; if not, guide the user. 
+ still_missing <- files_needed[!file.exists(file.path(dest, files_needed))] + if (length(still_missing) > 0L) { + stop( + paste0( + "wru_data_preflight(): some files are still missing after download: ", + paste(still_missing, collapse = ", "), + "\nCheck network/GitHub access or download these assets from the ", + "wru v2.0.0 release and place them in: ", normalizePath(dest), + "\nThen re-run." + ), + call. = FALSE + ) + } + invisible(TRUE) +} + #' Surname probability merging function. #' #' \code{merge_names} merges names in a user-input dataset with corresponding -#' race/ethnicity probabilities derived from both the U.S. Census Surname List -#' and Spanish Surname List and voter files from states in the Southern U.S. -#' -#' This function allows users to match names in their dataset with database entries -#' estimating P(name | ethnicity) for each of the five major racial groups for each -#' name. The database probabilities are derived from both the U.S. Census Surname List -#' and Spanish Surname List and voter files from states in the Southern U.S. -#' -#' By default, the function matches names as follows: -#' 1) Search raw surnames in the database; -#' 2) Remove any punctuation and search again; -#' 3) Remove any spaces and search again; -#' 4) Remove suffixes (e.g., "Jr") and search again (last names only) -#' 5) Split double-barreled names into two parts and search first part of name; -#' 6) Split double-barreled names into two parts and search second part of name; -#' -#' Each step only applies to names not matched in a previous step. -#' Steps 2 through 6 are not applied if \code{clean.surname} is FALSE. -#' -#' Note: Any name appearing only on the Spanish Surname List is assigned a -#' probability of 1 for Hispanics/Latinos and 0 for all other racial groups. +#' race/ethnicity probabilities derived from both the U.S. Census Surname List +#' and Spanish Surname List and voter files from states in the Southern U.S. 
#' -#' @param voter.file An object of class \code{data.frame}. Must contain a row for each individual being predicted, -#' as well as a field named \code{\var{last}} containing each individual's surname. -#' If first name is also being used for prediction, the file must also contain a field -#' named \code{\var{first}}. If middle name is also being used for prediction, the field -#' must also contain a field named \code{\var{middle}}. -#' @param namesToUse A character vector identifying which names to use for the prediction. -#' The default value is \code{"last"}, indicating that only the last name will be used. -#' Other options are \code{"last, first"}, indicating that both last and first names will be -#' used, and \code{"last, first, middle"}, indicating that last, first, and middle names will all -#' be used. -#' @param census.surname A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -#' function will call \code{merge_surnames} to merge in Pr(Race | Surname) -#' from U.S. Census Surname List (2000, 2010, or 2020) and Spanish Surname List. -#' If \code{FALSE}, user must provide a \code{name.dictionary} (see below). -#' Default is \code{TRUE}. -#' @param table.surnames An object of class \code{data.frame} provided by the -#' users as an alternative surname dictionary. It will consist of a list of -#' U.S. surnames, along with the associated probabilities P(name | ethnicity) -#' for ethnicities: white, Black, Hispanic, Asian, and other. Default is \code{NULL}. -#' (\code{\var{last_name}} for U.S. surnames, \code{\var{p_whi_last}} for White, -#' \code{\var{p_bla_last}} for Black, \code{\var{p_his_last}} for Hispanic, -#' \code{\var{p_asi_last}} for Asian, \code{\var{p_oth_last}} for other). -#' @param table.first See \code{\var{table.surnames}}. -#' @param table.middle See \code{\var{table.surnames}}. -#' @param impute.missing See \code{predict_race}. -#' @param model See \code{predict_race}. -#' @param clean.names A \code{TRUE}/\code{FALSE} object. 
If \code{TRUE}, -#' any surnames in \code{\var{voter.file}} that cannot initially be matched -#' to the database will be cleaned, according to U.S. Census specifications, -#' in order to increase the chance of finding a match. Default is \code{TRUE}. -#' @return Output will be an object of class \code{data.frame}. It will -#' consist of the original user-input data with additional columns that -#' specify the part of the name matched with Census data (\code{\var{surname.match}}), -#' and the probabilities Pr(Race | Surname) for each racial group -#' (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, -#' \code{\var{p_his}} for Hispanic/Latino, -#' \code{\var{p_asi}} for Asian and Pacific Islander, and -#' \code{\var{p_oth}} for Other/Mixed). -#' @importFrom dplyr coalesce -#' @examples -#' data(voters) -#' \dontrun{try(merge_names(voters, namesToUse = "surname", census.surname = TRUE))} +#' @param voter.file data.frame with at least a \code{surname} column; \code{first} and \code{middle} if used +#' @param namesToUse "surname", "surname, first", or "surname, first, middle" +#' @param census.surname logical; if TRUE, use Census/Spanish surname list +#' @param table.surnames Optional custom last-name dictionary (schema like shipped RDS) +#' @param table.first Optional custom first-name dictionary +#' @param table.middle Optional custom middle-name dictionary +#' @param clean.names logical; try cleaning/fallback passes +#' @param impute.missing logical; impute c_* NA with column means (else 1) +#' @param model kept for API parity +#' @param return.unmatched logical; add *_matched boolean columns +#' @return data.frame with probabilities (and flags if requested) #' @keywords internal -merge_names <- function(voter.file, namesToUse, census.surname, table.surnames = NULL, table.first = NULL, table.middle = NULL, clean.names = TRUE, impute.missing = FALSE, model = "BISG") { - - # check the names +merge_names <- function( + voter.file, + namesToUse, + 
census.surname, + table.surnames = NULL, + table.first = NULL, + table.middle = NULL, + clean.names = TRUE, + impute.missing = FALSE, + model = "BISG", + return.unmatched = TRUE +) { + # --- arg checks ------------------------------------------------------------ if (namesToUse == "surname") { if (!("surname" %in% names(voter.file))) { stop("Voter data frame needs to have a column named 'surname'.") } } else if (namesToUse == "surname, first") { - if (!("surname" %in% names(voter.file)) || !("first" %in% names(voter.file))) { - stop("Voter data frame needs to have a column named 'surname' and a column called 'first'.") + if (!all(c("surname","first") %in% names(voter.file))) { + stop("Voter data frame needs 'surname' and 'first'.") } } else if (namesToUse == "surname, first, middle") { - if (!("surname" %in% names(voter.file)) || !("first" %in% names(voter.file)) || - !("middle" %in% names(voter.file))) { - stop("Voter data frame needs to have a column named 'surname', a column called 'first', and a column called 'middle'.") + if (!all(c("surname","first","middle") %in% names(voter.file))) { + stop("Voter data frame needs 'surname', 'first', and 'middle'.") } } - + + # --- ensure data present (downloads only if missing) ----------------------- wru_data_preflight() - - path <- ifelse(getOption("wru_data_wd", default = FALSE), getwd(), tempdir()) - - first_c <- readRDS(paste0(path, "/wru-data-first_c.rds")) - mid_c <- readRDS(paste0(path, "/wru-data-mid_c.rds")) - if(census.surname){ - last_c <- readRDS(paste0(path, "/wru-data-census_last_c.rds")) - } else { - last_c <- readRDS(paste0(path, "/wru-data-last_c.rds")) + + path <- wru_cache_dir() + first_c <- readRDS(file.path(path, "wru-data-first_c.rds")) + mid_c <- readRDS(file.path(path, "wru-data-mid_c.rds")) + last_c <- readRDS(file.path( + path, + if (isTRUE(census.surname)) "wru-data-census_last_c.rds" else "wru-data-last_c.rds" + )) + + if (!is.data.frame(first_c) || !is.data.frame(mid_c) || 
!is.data.frame(last_c)) { + stop("wru cached name dictionaries are not data.frames; clear cache and re-run wru_data_preflight().") } p_eth <- c("c_whi", "c_bla", "c_his", "c_asi", "c_oth") + + # optional custom dictionaries; align names to shipped schema if (is.null(table.surnames)) { lastNameDict <- last_c } else { @@ -108,26 +147,22 @@ merge_names <- function(voter.file, namesToUse, census.surname, table.surnames = firstNameDict <- first_c } else { firstNameDict <- table.first - firstNameDict[is.na(firstNameDict)] <- 0 names(firstNameDict) <- names(first_c) + firstNameDict[is.na(firstNameDict)] <- 0 } if (is.null(table.middle)) { middleNameDict <- mid_c } else { middleNameDict <- table.middle - middleNameDict[is.na(middleNameDict)] <- 0 names(middleNameDict) <- names(mid_c) + middleNameDict[is.na(middleNameDict)] <- 0 } - - nameDict <- list( - "first" = firstNameDict, - "middle" = middleNameDict, - "last" = lastNameDict - ) - - ## Convert names in voter file to upper case + + nameDict <- list(first = firstNameDict, middle = middleNameDict, last = lastNameDict) + + # --- initial normalization & merges --------------------------------------- df <- voter.file - + df$lastname.match <- df$lastname.upper <- toupper(as.character(df$surname)) if (grepl("first", namesToUse)) { df$firstname.match <- df$firstname.upper <- toupper(as.character(df$first)) @@ -136,8 +171,8 @@ merge_names <- function(voter.file, namesToUse, census.surname, table.surnames = df$middlename.match <- df$middlename.upper <- toupper(as.character(df$middle)) df$middlename.match[is.na(df$middlename.match)] <- "" } - - ## Merge Surnames with Census List (No Cleaning Yet) + + # first pass merges (raw) df <- merge(df, lastNameDict, by.x = "lastname.match", by.y = "last_name", all.x = TRUE, sort = FALSE) if (grepl("first", namesToUse)) { df <- merge(df, firstNameDict, by.x = "firstname.match", by.y = "first_name", all.x = TRUE, sort = FALSE) @@ -145,201 +180,256 @@ merge_names <- function(voter.file, 
namesToUse, census.surname, table.surnames = if (grepl("middle", namesToUse)) { df <- merge(df, middleNameDict, by.x = "middlename.match", by.y = "middle_name", all.x = TRUE, sort = FALSE) } - - if (namesToUse == "surname" && sum(!(df$lastname.upper %in% lastNameDict$last_name)) == 0) { - return(df[, c(names(voter.file), "lastname.match", paste0(p_eth, "_last"))]) + + # compute flags now so early-returns also include them + if (isTRUE(return.unmatched)) { + if ("c_whi_last" %in% names(df)) df$last_matched <- !is.na(df$c_whi_last) + if ("c_whi_first" %in% names(df)) df$first_matched <- !is.na(df$c_whi_first) + if ("c_whi_middle" %in% names(df)) df$middle_matched <- !is.na(df$c_whi_middle) + } + + # unified early-return helper + fast_return <- function() { + prob_cols <- c(paste0(p_eth, "_last")) + name_cols <- c("lastname.match") + flag_cols <- character(0) + if (isTRUE(return.unmatched) && "last_matched" %in% names(df)) flag_cols <- c(flag_cols, "last_matched") + if (grepl("first", namesToUse)) { + prob_cols <- c(prob_cols, paste0(p_eth, "_first")) + name_cols <- c(name_cols, "firstname.match") + if (isTRUE(return.unmatched) && "first_matched" %in% names(df)) flag_cols <- c(flag_cols, "first_matched") + } + if (grepl("middle", namesToUse)) { + prob_cols <- c(prob_cols, paste0(p_eth, "_middle")) + name_cols <- c(name_cols, "middlename.match") + if (isTRUE(return.unmatched) && "middle_matched" %in% names(df)) flag_cols <- c(flag_cols, "middle_matched") + } + cols <- c(names(voter.file), name_cols, flag_cols, prob_cols) + df[, cols, drop = FALSE] } - if (namesToUse == "surname, first" && sum(!(df$lastname.match %in% lastNameDict$last_name)) == 0 && - sum(!(df$firstname.upper %in% firstNameDict$first_name)) == 0) { - return(df[, c(names(voter.file), "lastname.match", "firstname.match", paste0(p_eth, "_last"), paste0(p_eth, "_first"))]) + + if (namesToUse == "surname" && + sum(!(df$lastname.upper %in% lastNameDict$last_name)) == 0) { + return(fast_return()) } - if 
(namesToUse == "surname, first, middle" && sum(!(df$lastname.match %in% lastNameDict$last_name)) == 0 && - sum(!(df$firstname.upper %in% firstNameDict$first_name)) == 0 && sum(!(df$middlename.upper %in% middleNameDict$middle_name)) == 0) { - return(df[, c(names(voter.file), "lastname.match", "firstname.match", "middlename.match", paste0(p_eth, "_last"), paste0(p_eth, "_first"), paste0(p_eth, "_middle"))]) + if (namesToUse == "surname, first" && + sum(!(df$lastname.match %in% lastNameDict$last_name)) == 0 && + sum(!(df$firstname.upper %in% firstNameDict$first_name)) == 0) { + return(fast_return()) } - - ## Clean names (if specified by user) + if (namesToUse == "surname, first, middle" && + sum(!(df$lastname.match %in% lastNameDict$last_name)) == 0 && + sum(!(df$firstname.upper %in% firstNameDict$first_name)) == 0 && + sum(!(df$middlename.upper %in% middleNameDict$middle_name))== 0) { + return(fast_return()) + } + + # --- cleaning / fallback passes (dimension-safe) -------------------------- if (clean.names) { - for (nameType in strsplit(namesToUse, ", ")[[1]]) { - if(nameType=="surname"){ - nameType <- "last" - } - df1 <- df[!is.na(df[, paste("c_whi_", nameType, sep = "")]), ] # Matched names - df2 <- df[is.na(df[, paste("c_whi_", nameType, sep = "")]), ] # Unmatched names - - ## Remove All Punctuation and Try Merge Again + for (nameType0 in strsplit(namesToUse, ", ")[[1]]) { + nameType <- if (nameType0 == "surname") "last" else nameType0 + + col_prob <- paste0("c_whi_", nameType) + col_match <- paste0(nameType, "name.match") + col_upper <- paste0(nameType, "name.upper") + + df1 <- df[!is.na(df[, col_prob]), , drop = FALSE] + df2 <- df[ is.na(df[, col_prob]), , drop = FALSE] + + ## 1) strip punctuation if (nrow(df2) > 0) { - df2[, paste(nameType, "name.match", sep = "")] <- gsub("[^[:alnum:] ]", "", df2[, paste(nameType, "name.upper", sep = "")]) - - df2 <- merge(df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], nameDict[[nameType]], + df2[, col_match] <- 
gsub("[^[:alnum:] ]", "", df2[, col_upper]) + df2 <- merge( + df2[, !grepl(paste0("_", nameType), names(df2))], + nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = "_"), + by.x = col_match, by.y = paste0(nameType, "_name"), sort = FALSE ) - df2 <- df2[, names(df1)] # reorder the columns - - if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = ""), ]), ]) - df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ] + df2 <- df2[, names(df1), drop = FALSE] + + keep <- !is.na(df2[, col_prob]) + if (any(keep)) { + df1 <- rbind(df1, df2[keep, , drop = FALSE]) + df2 <- df2[!keep, , drop = FALSE] } } - - ## Remove All Spaces and Try Merge Again + + ## 2) strip spaces if (nrow(df2) > 0) { - df2[, paste(nameType, "name.match", sep = "")] <- gsub(" ", "", df2[, paste(nameType, "name.match", sep = "")]) - df2 <- merge(df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], nameDict[[nameType]], + df2[, col_match] <- gsub(" ", "", df2[, col_match]) + df2 <- merge( + df2[, !grepl(paste0("_", nameType), names(df2))], + nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = "_"), + by.x = col_match, by.y = paste0(nameType, "_name"), sort = FALSE ) - df2 <- df2[, names(df1)] # reorder the columns - - if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = ""), ]), ]) - df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ] + df2 <- df2[, names(df1), drop = FALSE] + + keep <- !is.na(df2[, col_prob]) + if (any(keep)) { + df1 <- rbind(df1, df2[keep, , drop = FALSE]) + df2 <- df2[!keep, , drop = FALSE] } } - - # Edits specific to common issues with last names - if (nameType == "last" & nrow(df2) > 0) { - - ## Remove Jr/Sr/III Suffixes for last names - suffix <- 
c("JUNIOR", "SENIOR", "THIRD", "III", "JR", " II", " J R", " S R", " IV") - for (i in 1:length(suffix)) { - df2$lastname.match <- ifelse(substr(df2$lastname.match, nchar(df2$lastname.match) - (nchar(suffix)[i] - 1), nchar(df2$lastname.match)) == suffix[i], - substr(df2$lastname.match, 1, nchar(df2$lastname.match) - nchar(suffix)[i]), + + ## 3) last-name suffix cleanup + if (nameType == "last" && nrow(df2) > 0) { + suffix <- c("JUNIOR","SENIOR","THIRD","III","JR"," II"," J R"," S R"," IV") + for (suf in suffix) { + df2$lastname.match <- ifelse( + substr(df2$lastname.match, + nchar(df2$lastname.match) - (nchar(suf) - 1), + nchar(df2$lastname.match)) == suf, + substr(df2$lastname.match, 1, nchar(df2$lastname.match) - nchar(suf)), df2$lastname.match ) } - df2$lastname.match <- ifelse(nchar(df2$lastname.match) >= 7, - ifelse(substr(df2$lastname.match, nchar(df2$lastname.match) - 1, nchar(df2$lastname.match)) == "SR", - substr(df2$lastname.match, 1, nchar(df2$lastname.match) - 2), - df2$lastname.match - ), + df2$lastname.match <- ifelse( + nchar(df2$lastname.match) >= 7 & + substr(df2$lastname.match, nchar(df2$lastname.match) - 1, nchar(df2$lastname.match)) == "SR", + substr(df2$lastname.match, 1, nchar(df2$lastname.match) - 2), df2$lastname.match - ) # Remove "SR" only if name has at least 7 characters - + ) + df2 <- merge( - df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], - lastNameDict, by.x = "lastname.match", by.y = "last_name", - all.x = TRUE, sort = FALSE) - df2 <- df2[, names(df1)] # reorder the columns - - if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = ""), ]), ]) - df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ] + df2[, !grepl(paste0("_", nameType), names(df2))], + lastNameDict, + by.x = "lastname.match", by.y = "last_name", + all.x = TRUE, sort = FALSE + ) + df2 <- df2[, names(df1), drop = FALSE] + + keep <- !is.na(df2[, col_prob]) + if 
(any(keep)) { + df1 <- rbind(df1, df2[keep, , drop = FALSE]) + df2 <- df2[!keep, , drop = FALSE] } } - - - ## Names with Hyphens or Spaces, e.g. Double-Barreled Names + + ## 4) double-barreled (first half then second half) if (nrow(df2) > 0) { df2$name2 <- df2$name1 <- NA - df2$name1[grep("-", df2[, paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep("-", df2[, paste(nameType, "name.upper", sep = "")], value = T), "-"), "[", 1) - df2$name2[grep("-", df2[, paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep("-", df2[, paste(nameType, "name.upper", sep = "")], value = T), "-"), "[", 2) - df2$name1[grep(" ", df2[, paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep(" ", df2[, paste(nameType, "name.upper", sep = "")], value = T), " "), "[", 1) - df2$name2[grep(" ", df2[, paste(nameType, "name.upper", sep = "")])] <- sapply(strsplit(grep(" ", df2[, paste(nameType, "name.upper", sep = "")], value = T), " "), "[", 2) - - ## Use first half of name to merge in priors - df2[, paste(nameType, "name.match", sep = "")] <- as.character(df2$name1) - df2 <- merge(df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], nameDict[[nameType]], + + # hyphen split + idx <- grep("-", df2[, col_upper]) + if (length(idx)) { + parts <- strsplit(df2[idx, col_upper], "-") + df2$name1[idx] <- vapply(parts, `[`, character(1), 1) + df2$name2[idx] <- vapply(parts, `[`, character(1), 2) + } + # space split (fill only NAs left by hyphen step) + idx <- grep(" ", df2[, col_upper]) + if (length(idx)) { + parts <- strsplit(df2[idx, col_upper], " ") + fill1 <- vapply(parts, `[`, character(1), 1) + fill2 <- vapply(parts, `[`, character(1), 2) + df2$name1[idx] <- ifelse(is.na(df2$name1[idx]), fill1, df2$name1[idx]) + df2$name2[idx] <- ifelse(is.na(df2$name2[idx]), fill2, df2$name2[idx]) + } + + # first half + df2[, col_match] <- as.character(df2$name1) + df2 <- merge( + df2[, !grepl(paste0("_", nameType), names(df2))], + nameDict[[nameType]], all.x = TRUE, - 
by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = "_"), + by.x = col_match, by.y = paste0(nameType, "_name"), sort = FALSE ) - df2 <- df2[, c(names(df1), "name1", "name2")] # reorder the columns - - if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = "")]), !(names(df2) %in% c("name1", "name2"))]) - df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ] + df2 <- df2[, c(names(df1), "name1", "name2"), drop = FALSE] + + keep <- !is.na(df2[, col_prob]) + if (any(keep)) { + df1 <- rbind(df1, df2[keep, !(names(df2) %in% c("name1","name2")), drop = FALSE]) + df2 <- df2[!keep, , drop = FALSE] } } - - ## Use second half of name to merge in priors for rest + if (nrow(df2) > 0) { - df2[, paste(nameType, "name.match", sep = "")] <- as.character(df2$name2) - df2 <- merge(df2[, !grepl(paste("_", nameType, sep = ""), names(df2))], nameDict[[nameType]], + # second half + df2[, col_match] <- as.character(df2$name2) + df2 <- merge( + df2[, !grepl(paste0("_", nameType), names(df2))], + nameDict[[nameType]], all.x = TRUE, - by.x = paste(nameType, "name.match", sep = ""), by.y = paste(nameType, "name", sep = "_"), + by.x = col_match, by.y = paste0(nameType, "_name"), sort = FALSE ) - df2 <- df2[, c(names(df1), "name1", "name2")] # reorder the columns - - if (sum(!is.na(df2[, paste("c_whi_", nameType, sep = ""), ])) > 0) { - df1 <- rbind(df1, df2[!is.na(df2[, paste("c_whi_", nameType, sep = "")]), !(names(df2) %in% c("name1", "name2"))]) - df2 <- df2[is.na(df2[, paste("c_whi_", nameType, sep = "")]), ] + df2 <- df2[, c(names(df1), "name1", "name2"), drop = FALSE] + + keep <- !is.na(df2[, col_prob]) + if (any(keep)) { + df1 <- rbind(df1, df2[keep, !(names(df2) %in% c("name1","name2")), drop = FALSE]) + df2 <- df2[!keep, , drop = FALSE] } } - + + # stitch back if (nrow(df2) > 0) { - df <- rbind(df1, df2[, !(names(df2) %in% c("name1", "name2"))]) + df <- 
rbind(df1, df2[, !(names(df2) %in% c("name1","name2")), drop = FALSE]) } else { df <- df1 } } } - - - ## For unmatched names, just fill with an column mean if impute is true, or with constant if false + + # --- reporting ------------------------------------------------------------- c_miss_last <- mean(is.na(df$c_whi_last)) if (c_miss_last > 0) { - message(paste(paste(sum(is.na(df$c_whi_last)), " (", round(100 * mean(is.na(df$c_whi_last)), 1), "%) individuals' last names were not matched.", sep = ""))) + message(paste0(sum(is.na(df$c_whi_last)), " (", round(100 * c_miss_last, 1), "%) individuals' last names were not matched.")) } if (grepl("first", namesToUse)) { c_miss_first <- mean(is.na(df$c_whi_first)) if (c_miss_first > 0) { - message(paste(paste(sum(is.na(df$c_whi_first)), " (", round(100 * mean(is.na(df$c_whi_first)), 1), "%) individuals' first names were not matched.", sep = ""))) + message(paste0(sum(is.na(df$c_whi_first)), " (", round(100 * c_miss_first, 1), "%) individuals' first names were not matched.")) } } if (grepl("middle", namesToUse)) { c_miss_mid <- mean(is.na(df$c_whi_middle)) if (c_miss_mid > 0) { - message(paste(paste(sum(is.na(df$c_whi_middle)), " (", round(100 * mean(is.na(df$c_whi_middle)), 1), "%) individuals' middle names were not matched.", sep = ""))) + message(paste0(sum(is.na(df$c_whi_middle)), " (", round(100 * c_miss_mid, 1), "%) individuals' middle names were not matched.")) } } - + + # flags BEFORE imputation + if (isTRUE(return.unmatched)) { + if ("c_whi_last" %in% names(df)) df$last_matched <- !is.na(df$c_whi_last) + if ("c_whi_first" %in% names(df)) df$first_matched <- !is.na(df$c_whi_first) + if ("c_whi_middle" %in% names(df)) df$middle_matched <- !is.na(df$c_whi_middle) + } + + # --- imputation ------------------------------------------------------------ if (impute.missing) { - impute.vec <- colMeans(df[, grep("c_", names(df), value = TRUE)], na.rm = TRUE) - for (i in grep("c_", names(df), value = TRUE)) { + impute.vec <- 
colMeans(df[, grep("^c_", names(df), value = TRUE), drop = FALSE], na.rm = TRUE) + for (i in grep("^c_", names(df), value = TRUE)) { df[, i] <- dplyr::coalesce(df[, i], impute.vec[i]) } } else { - for (i in grep("c_", names(df), value = TRUE)) { + for (i in grep("^c_", names(df), value = TRUE)) { df[, i] <- dplyr::coalesce(df[, i], 1) } } - - # return the data - if (namesToUse == "surname") { - return(df[, c(names(voter.file), "lastname.match", paste(p_eth, "last", sep = "_"))]) - } else if (namesToUse == "surname, first") { - return(df[, c( - names(voter.file), "lastname.match", "firstname.match", - paste(p_eth, "last", sep = "_"), paste(p_eth, "first", sep = "_") - )]) - } else if (namesToUse == "surname, first, middle") { - return(df[, c( - names(voter.file), "lastname.match", "firstname.match", "middlename.match", - paste(p_eth, "last", sep = "_"), paste(p_eth, "first", sep = "_"), paste(p_eth, "middle", sep = "_") - )]) + + # --- unified return -------------------------------------------------------- + prob_cols <- c(paste0(p_eth, "_last")) + name_cols <- c("lastname.match") + flag_cols <- character(0) + if (isTRUE(return.unmatched) && "last_matched" %in% names(df)) flag_cols <- c("last_matched", flag_cols) + + if (grepl("first", namesToUse)) { + prob_cols <- c(prob_cols, paste0(p_eth, "_first")) + name_cols <- c(name_cols, "firstname.match") + if (isTRUE(return.unmatched) && "first_matched" %in% names(df)) flag_cols <- c(flag_cols, "first_matched") + } + if (grepl("middle", namesToUse)) { + prob_cols <- c(prob_cols, paste0(p_eth, "_middle")) + name_cols <- c(name_cols, "middlename.match") + if (isTRUE(return.unmatched) && "middle_matched" %in% names(df)) flag_cols <- c(flag_cols, "middle_matched") } + + cols <- c(names(voter.file), name_cols, flag_cols, prob_cols) + df[, cols, drop = FALSE] } - -#' Preflight for name data -#' -#' Checks if namedata is available in the current working directory, if not -#' downloads it from github using piggyback. 
By default, wru will download the -#' data to a temporary directory that lasts as long as your session does. -#' However, you may wish to set the \code{wru_data_wd} option to save the -#' downloaded data to your current working directory for more permanence. -#' -#' @importFrom piggyback pb_download -wru_data_preflight <- function() { - dest <- ifelse(getOption("wru_data_wd", default = FALSE), getwd(), tempdir()) - tryCatch( - # Oddity of conditions for .token. Ignores token if is "" - piggyback::pb_download(repo = "kosukeimai/wru", dest = dest, .token = "", tag = "v2.0.0"), - error = function(e) message("There was an error retrieving data: ", e$message) - ) -} diff --git a/R/predict_race.R b/R/predict_race.R index 1f31b7c..6d092a2 100644 --- a/R/predict_race.R +++ b/R/predict_race.R @@ -40,17 +40,11 @@ #' must have columns named \code{county}, \code{tract}, and \code{block}. #' If \code{\var{census.geo} = "place"}, then \code{\var{voter.file}} #' must have column named \code{place}. -#' If `census.geo = "zcta"`, then `voter.file` must have column named `zcta`. #' Specifying \code{\var{census.geo}} will call \code{census_helper} function #' to merge Census geographic data at specified level of geography. -#' -#' @param census.key A character object specifying user's Census API key. -#' Required if `census.geo` is specified, because a valid Census API key is -#' required to download Census geographic data. -#' -#' If [`NULL`], the default, attempts to find a census key stored in an -#' [environment variable][Sys.getenv] named `CENSUS_API_KEY`. -#' +#' @param census.key A character object specifying user's Census API +#' key. Required if \code{\var{census.geo}} is specified, because +#' a valid Census API key is required to download Census geographic data. #' @param census.data A list indexed by two-letter state abbreviations, #' which contains pre-saved Census geographic data. #' Can be generated using \code{get_census_data} function. 
@@ -74,10 +68,7 @@ #' Whatever the name of the party registration field in \code{\var{voter.file}}, #' it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other. #' @param retry The number of retries at the census website if network interruption occurs. -#' @param impute.missing Logical, defaults to TRUE. Should missing be imputed? -#' @param skip_bad_geos Logical. Option to have the function skip any geolocations that are not present -#' in the census data, returning a partial data set. Default is set to \code{FALSE}, in which case it -#' will break and provide error message with a list of offending geolocations. +#' @param impute.missing Logical, defaults to TRUE. Should missing be imputed? FALSE = A constant(1) #' @param use.counties A logical, defaulting to FALSE. Should census data be filtered by counties #' available in \var{census.data}? #' @param model Character string, either "BISG" (default) or "fBISG" (for error-correction, @@ -92,13 +83,15 @@ #' @param race.init Vector of initial race for each observation in voter.file. #' Must be an integer vector, with 1=white, 2=black, 3=hispanic, 4=asian, and #' 5=other. Defaults to values obtained using \code{model="BISG_surname"}. +#' @param return.unmatched Generates Boolean columns for each name reporting +#' whether a match was made. Default is set to \code{TRUE}. #' @param control List of control arguments only used when \code{model="fBISG"}, including -#' \describe{ -#' \item{iter}{Number of MCMC iterations. Defaults to 1000.} -#' \item{burnin}{Number of iterations discarded as burnin. Defaults to half of \code{iter}.} -#' \item{verbose}{Print progress information. Defaults to \code{TRUE}.} -#' \item{me.correct}{Boolean. Should the model correct measurement error for \code{races|geo}? Defaults to \code{TRUE}.} -#' \item{seed}{RNG seed. If \code{NULL}, a seed is generated and returned as an attribute for reproducibility.} +#' \itemize{ +#' \item{iter}{ Number of MCMC iterations. 
Defaults to 1000.} +#' \item{burnin}{ Number of iterations discarded as burnin. Defaults to half of \code{iter}.} +#' \item{verbose}{ Print progress information. Defaults to \code{TRUE}.} +#' \item{me.correct}{ Boolean. Should the model correcting measurement error for \code{races|geo}? Defaults to \code{TRUE}.} +#' \item{seed}{ RNG seed. If \code{NULL}, a seed is generated and returned as an attribute for reproducibility.} #' } #' #' @return Output will be an object of class \code{data.frame}. It will @@ -115,52 +108,36 @@ #' #' data(voters) #' try(predict_race(voter.file = voters, surname.only = TRUE)) #' \dontrun{ -#' try(predict_race(voter.file = voters, census.geo = "tract")) +#' try(predict_race(voter.file = voters, census.geo = "tract", census.key = "...")) #' } #' \dontrun{ #' try(predict_race( -#' voter.file = voters, census.geo = "place", year = "2020")) +#' voter.file = voters, census.geo = "place", census.key = "...", year = "2020")) #' } #' \dontrun{ -#' CensusObj <- try(get_census_data(state = c("NY", "DC", "NJ"))) +#' CensusObj <- try(get_census_data("...", state = c("NY", "DC", "NJ"))) #' try(predict_race( #' voter.file = voters, census.geo = "tract", census.data = CensusObj, party = "PID") #' ) #' } #' \dontrun{ -#' CensusObj2 <- try(get_census_data(state = c("NY", "DC", "NJ"), age = T, sex = T)) +#' CensusObj2 <- try(get_census_data(key = "...", state = c("NY", "DC", "NJ"), age = T, sex = T)) #' try(predict_race( #' voter.file = voters, census.geo = "tract", census.data = CensusObj2, age = T, sex = T)) #' } #' \dontrun{ -#' CensusObj3 <- try(get_census_data(state = c("NY", "DC", "NJ"), census.geo = "place")) +#' CensusObj3 <- try(get_census_data(key = "...", state = c("NY", "DC", "NJ"), census.geo = "place")) #' try(predict_race(voter.file = voters, census.geo = "place", census.data = CensusObj3)) #' } #' } #' @export -predict_race <- function( - voter.file, - census.surname = TRUE, - surname.only = FALSE, - census.geo = c("tract", "block", 
"block_group", "county", "place", "zcta"), - census.key = Sys.getenv("CENSUS_API_KEY"), - census.data = NULL, - age = FALSE, - sex = FALSE, - year = "2020", - party = NULL, - retry = 3, - impute.missing = TRUE, - skip_bad_geos = FALSE, - use.counties = FALSE, - model = "BISG", - race.init = NULL, - name.dictionaries = NULL, - names.to.use = "surname", - control = NULL -) { +predict_race <- function(voter.file, census.surname = TRUE, surname.only = FALSE, + census.geo, census.key = NULL, census.data = NULL, age = FALSE, + sex = FALSE, year = "2020", party = NULL, retry = 3, impute.missing = TRUE, + use.counties = FALSE, model = "BISG", race.init = NULL, return.unmatched = TRUE, + name.dictionaries = NULL, names.to.use = "surname", control = NULL) { message("Predicting race for ", year) @@ -183,8 +160,6 @@ predict_race <- function( ) } - census.geo <- tolower(census.geo) - census.geo <- rlang::arg_match(census.geo) # block_group is missing, pull from block if((surname.only == FALSE) && !(missing(census.geo)) && (census.geo == "block_group") && !("block_group" %in% names(voter.file))) { @@ -194,10 +169,21 @@ predict_race <- function( # Adjust voter.file with caseid for ordering at the end voter.file$caseid <- 1:nrow(voter.file) - if (surname.only == FALSE && is.null(census.data)) { + if((surname.only==FALSE) && is.null(census.key) && is.null(census.data)) { + k <- Sys.getenv("CENSUS_API_KEY") + + if(k == "") + stop( + "Please provide a valid Census API key using census.key option.", + " Or set CENSUS_API_KEY in your .Renviron or .Rprofile" + ) + + census.key <- k + } + + if(surname.only==FALSE && is.null(census.data)) { # Otherwise predict_race_new and predict_race_me will both # attempt to pull census_data - census.key <- validate_key(census.key) voter.file$state <- toupper(voter.file$state) states <- unique(voter.file$state) county.list <- split(voter.file$county, voter.file$state) @@ -224,7 +210,7 @@ predict_race <- function( census.data = census.data, retry = 
retry, impute.missing = impute.missing, - skip_bad_geos = skip_bad_geos, + return.unmatched = return.unmatched, census.surname = census.surname, use.counties = use.counties) } else { @@ -242,24 +228,20 @@ predict_race <- function( if(ctrl$verbose){ message("Using `predict_race` to obtain initial race prediction priors with BISG model") } - - race.init <- predict_race(voter.file = voter.file, - names.to.use = names.to.use, - year = year, - age = age, sex = sex, # not implemented, default to F - census.geo = census.geo, - census.key = census.key, - name.dictionaries = name.dictionaries, - surname.only=surname.only, - census.data = census.data, - retry = retry, - impute.missing = TRUE, - skip_bad_geos = skip_bad_geos, - census.surname = census.surname, - use.counties = use.counties, - model = "BISG", - control = list(verbose=FALSE)) - + race.init <- predict_race_new(voter.file = voter.file, + names.to.use = names.to.use, + year = year, + age = age, sex = sex, # not implemented, default to F + census.geo = census.geo, + census.key = census.key, + name.dictionaries = name.dictionaries, + surname.only=surname.only, + census.data = census.data, + retry = retry, + impute.missing = TRUE, + census.surname = census.surname, + return.unmatched = return.unmatched, + use.counties = use.counties) race.init <- max.col( race.init[, paste0("pred.", c("whi", "bla", "his", "asi", "oth"))], ties.method = "random" diff --git a/R/race_prediction_funs.R b/R/race_prediction_funs.R index 77ff933..db88a33 100644 --- a/R/race_prediction_funs.R +++ b/R/race_prediction_funs.R @@ -1,7 +1,7 @@ #' Internal model fitting functions #' #' These functions are intended for internal use only. Users should use the -#' [predict_race()] interface rather any of these functions directly. +#' \code{race_predict} interface rather any of these functions directly. #' #' These functions fit different versions of WRU. 
\code{.predict_race_old} fits #' the original WRU model, also known as BISG with census-based surname dictionary. @@ -13,12 +13,12 @@ #' the augmented surname dictionary, and the first and middle name #' dictionaries when making predictions. #' -#' @inheritParams predict_race #' @param voter.file See documentation in \code{race_predict}. #' @param census.surname See documentation in \code{race_predict}. #' @param surname.only See documentation in \code{race_predict}. #' @param surname.year See documentation in \code{race_predict}. #' @param census.geo See documentation in \code{race_predict}. +#' @param census.key See documentation in \code{race_predict}. #' @param census.data See documentation in \code{race_predict}. #' @param age See documentation in \code{race_predict}. #' @param sex See documentation in \code{race_predict}. @@ -26,14 +26,13 @@ #' @param party See documentation in \code{race_predict}. #' @param retry See documentation in \code{race_predict}. #' @param impute.missing See documentation in \code{race_predict}. -#' @param skip_bad_geos See documentation in \code{race_predict}. #' @param names.to.use See documentation in \code{race_predict}. #' @param race.init See documentation in \code{race_predict}. #' @param name.dictionaries See documentation in \code{race_predict}. -#' @param ctrl See `control` in documentation for [predict_race()]. +#' @param ctrl See \code{control} in documentation for \code{race_predict}. #' @param use.counties A logical, defaulting to FALSE. Should census data be filtered by counties available in \var{census.data}? #' -#' @inherit predict_race return +#' @return See documentation in \code{race_predict}. 
#' #' @name modfuns NULL @@ -46,23 +45,10 @@ NULL #' @rdname modfuns #' @keywords internal -.predict_race_old <- function( - voter.file, - census.surname = TRUE, - surname.only = FALSE, - surname.year = 2020, - name.dictionaries = NULL, - census.geo, - census.key = Sys.getenv("CENSUS_API_KEY"), - census.data = NULL, - age = FALSE, - sex = FALSE, - year = "2020", - party, - retry = 3, - impute.missing = TRUE, - use.counties = FALSE -) { +.predict_race_old <- function(voter.file, + census.surname = TRUE, surname.only = FALSE, surname.year = 2020, name.dictionaries = NULL, + census.geo, census.key, census.data = NULL, age = FALSE, sex = FALSE, year = "2020", + party, retry = 3, impute.missing = TRUE, use.counties = FALSE) { # warning: 2020 census data only support prediction when both age and sex are equal to FALSE if ((sex == TRUE || age == TRUE) && (year == "2020")) { @@ -82,22 +68,28 @@ NULL stop("Voter data frame needs to have a column named surname") } } else { - if (missing(census.geo) || is.null(census.geo) || all(is.na(census.geo)) || census.geo %in% c("county", "tract", "block", "place") == FALSE) { + if (missing(census.geo) || is.null(census.geo) || is.na(census.geo) || census.geo %in% c("county", "tract", "block", "place") == FALSE) { stop("census.geo must be either 'county', 'tract', 'block', or 'place'") } else { message(paste("Proceeding with Census geographic data at", census.geo, "level...")) } - if (missing(census.data) || is.null(census.data) || all(is.na(census.data))) { - census.key <- validate_key(census.key) - message("Downloading Census geographic data using provided API key...") + if (missing(census.data) || is.null(census.data) || is.na(census.data)) { + if (missing(census.key) || is.null(census.key) || is.na(census.key)) { + stop("Please provide a valid Census API key using census.key option.") + } else { + message("Downloading Census geographic data using provided API key...") + } } else { if (!("state" %in% names(voter.file))) { 
stop("voter.file object needs to have a column named state.") } if (sum(toupper(unique(as.character(voter.file$state))) %in% toupper(names(census.data)) == FALSE) > 0) { message("census.data object does not include all states in voter.file object.") - census.key <- validate_key(census.key) - message("Downloading Census geographic data for states not included in census.data object...") + if (missing(census.key) || is.null(census.key) || is.na(census.key)) { + stop("Please provide either a valid Census API key or valid census.data object that covers all states in voter.file object.") + } else { + message("Downloading Census geographic data for states not included in census.data object...") + } } else { message("Using Census geographic data from provided census.data object...") } @@ -269,24 +261,10 @@ NULL #' New race prediction function, implementing classical BISG with augmented #' surname dictionary, as well as first and middle name information. #' @rdname modfuns - -predict_race_new <- function( - voter.file, - names.to.use, - year = "2020", - age = FALSE, - sex = FALSE, - census.geo = c("tract", "block", "block_group", "county", "place", "zcta"), - census.key = Sys.getenv("CENSUS_API_KEY"), - name.dictionaries, - surname.only=FALSE, - census.data = NULL, - retry = 0, - impute.missing = TRUE, - skip_bad_geos = FALSE, - census.surname = FALSE, - use.counties = FALSE -) { +predict_race_new <- function(voter.file, names.to.use, year = "2020",age = FALSE, sex = FALSE, + census.geo, census.key = NULL, name.dictionaries, surname.only=FALSE, + census.data = NULL, retry = 0, impute.missing = TRUE, census.surname = FALSE, + return.unmatched = TRUE, use.counties = FALSE) { # Check years if (!(year %in% c("2000", "2010", "2020"))){ @@ -295,9 +273,10 @@ predict_race_new <- function( # Define 2020 race marginal race.margin <- c(r_whi=0.5783619, r_bla=0.1205021, r_his=0.1872988, r_asi=0.06106737, r_oth=0.05276981) - - census.geo <- tolower(census.geo) - census.geo <- 
rlang::arg_match(census.geo) + # check the geography + if (!missing(census.geo) && (census.geo == "precinct")) { + stop("Error: census_helper function does not currently support merging precinct-level data.") + } vars.orig <- names(voter.file) @@ -346,11 +325,17 @@ predict_race_new <- function( # check the geographies if (surname.only == FALSE) { - message("Proceeding with Census geographic data at ", census.geo, " level...") - + if (!(census.geo %in% c("county", "tract","block_group", "block", "place"))) { + stop("census.geo must be either 'county', 'tract', 'block', 'block_group', or 'place'") + } else { + message(paste("Proceeding with Census geographic data at", census.geo, "level...")) + } if (is.null(census.data)) { - census.key <- validate_key(census.key) - message("Downloading Census geographic data using provided API key...") + if (missing(census.key) || is.null(census.key) || is.na(census.key)) { + stop("Please provide a valid Census API key using census.key option.") + } else { + message("Downloading Census geographic data using provided API key...") + } } else { if (!("state" %in% names(voter.file))) { stop("voter.file object needs to have a column named state.") @@ -358,14 +343,24 @@ predict_race_new <- function( census_data_preflight(census.data, census.geo, year) if (sum(toupper(unique(as.character(voter.file$state))) %in% toupper(names(census.data)) == FALSE) > 0) { message("census.data object does not include all states in voter.file object.") - census.key <- validate_key(census.key) - message("Downloading Census geographic data for states not included in census.data object...") + if (missing(census.key) || is.null(census.key) || is.na(census.key)) { + stop("Please provide either a valid Census API key or valid census.data object that covers all states in voter.file object.") + } else { + message("Downloading Census geographic data for states not included in census.data object...") + } } else { message("Using Census geographic data from provided 
census.data object...") } } - geo_id_names <- determine_geo_id_names(census.geo) + geo_id_names <- switch( + census.geo, + "county" = c("county"), + "tract" = c("county", "tract"), + "block_group" = c("county", "tract", "block_group"), + "block" = c("county", "tract", "block"), + "place" = c("place") + ) if (!all(geo_id_names %in% names(voter.file))) { stop(message("To use",census.geo,"as census.geo, voter.file needs to include the following column(s):", @@ -382,8 +377,7 @@ predict_race_new <- function( year = year, census.data = census.data, retry = retry, - use.counties = use.counties, - skip_bad_geos = skip_bad_geos + use.counties = use.counties ) } @@ -398,6 +392,7 @@ predict_race_new <- function( table.middle=name.dictionaries[["middle"]], clean.names = TRUE, impute.missing = impute.missing, + return.unmatched = return.unmatched, model = 'BISG') if (surname.only == TRUE) { @@ -415,17 +410,24 @@ predict_race_new <- function( } } - if(impute.missing){ - for(i in ncol(preds)){ - preds[, i] <- dplyr::coalesce(preds[, i], race.margin[i]) - } - } ## Normalize (recycle marginal) preds <- preds/rowSums(preds) - ## Revert to national Pr(Race) for missing predictions + ## Revert to Pr(Race|Surname) for missing predictions + if(impute.missing){ + miss_ind <- !is.finite(preds$c_whi_last) + if(any(miss_ind)){ + preds[miss_ind,] <- voter.file[miss_ind, grep("_last$", names(voter.file))] * + matrix(race.margin, nrow=nrow(voter.file[miss_ind,]), ncol=length(race.margin), byrow = TRUE) + } + } colnames(preds) <- paste("pred", eth, sep = ".") - return(data.frame(cbind(voter.file[c(vars.orig)], preds))) + if(return.unmatched == TRUE) { + return(data.frame(cbind(voter.file, preds)) |> + dplyr::select(vars.orig, any_of(c("last", "first", "middle")), ends_with("_matched"), -starts_with("c_"),-ends_with(".match"), starts_with("pred."))) + } else { + return(data.frame(cbind(voter.file[c(vars.orig)], preds))) + } } @@ -433,30 +435,12 @@ predict_race_new <- function( #' New race 
prediction function, implementing fBISG (i.e. measurement #' error correction, fully Bayesian model) with augmented #' surname dictionary, as well as first and middle name information. -#' @importFrom dplyr pull #' @rdname modfuns - -predict_race_me <- function( - voter.file, - names.to.use, - year = "2020", - age = FALSE, - sex = FALSE, - census.geo = c("tract", "block", "block_group", "county", "place", "zcta"), - census.key = Sys.getenv("CENSUS_API_KEY"), - name.dictionaries, - surname.only = FALSE, - census.data = NULL, - retry = 0, - impute.missing = TRUE, - census.surname = FALSE, - use.counties = FALSE, - race.init, - ctrl -) { - census.geo <- tolower(census.geo) - census.geo <- rlang::arg_match(census.geo) - +predict_race_me <- function(voter.file, names.to.use, year = "2020",age = FALSE, sex = FALSE, + census.geo, census.key, name.dictionaries, surname.only=FALSE, + census.data = NULL, retry = 0, impute.missing = TRUE, census.surname = FALSE, + return.unmatched = return.unmatched, use.counties = FALSE, race.init, ctrl) +{ if(!is.null(census.data)) { census_data_preflight(census.data, census.geo, year) } @@ -499,6 +483,9 @@ predict_race_me <- function( } ## Other quick checks... 
+ if (!(census.geo %in% c("county", "tract","block_group", "block", "place"))) { + stop("census.geo must be either 'county', 'tract', 'block', 'block_group', or 'place'") + } stopifnot( all(!is.na(voter.file$surname)) ) @@ -528,7 +515,15 @@ predict_race_me <- function( ) ## level of geo estimation - geo_id_names <- c("state", determine_geo_id_names(census.geo)) + geo_id_names <- c("state", switch(census.geo, + "county" = c("county"), + "tract" = c("county", "tract"), + "block_group" = c("county", "tract", "block_group"), + "block" = c("county", "tract", "block"), + "place" = c("place"), + "zipcode" = c("zipcode") + )) + #race_pred_args[names(args_usr)] <- args_usr all_states <- unique(voter.file$state) @@ -540,31 +535,38 @@ predict_race_me <- function( if (ctrl$verbose) { message("Forming Pr(race | location) tables from census data...\n") } - - vars_ <- census_geo_api_names(year = year) - - N_rg <- purrr::map( + if(year == "2020") { + vars_ <- c( + pop_white = 'P2_005N', pop_black = 'P2_006N', + pop_aian = 'P2_007N', pop_asian = 'P2_008N', + pop_nhpi = 'P2_009N', pop_other = 'P2_010N', + pop_two = 'P2_011N', pop_hisp = 'P2_002N' + ) + } else { + vars_ <- c( + pop_white = 'P005003', pop_black = 'P005004', + pop_aian = 'P005005', pop_asian = 'P005006', + pop_nhpi = 'P005007', pop_other = 'P005008', + pop_two = 'P005009', pop_hisp = 'P005010' + ) + } + tmp_tabs <- lapply( census.data, function(x) { all_names <- names(x[[census.geo]]) - - if (any(c("P2_005N", "P005003") %in% all_names)) { - vars_ <- census_geo_api_names_legacy(year = year) - } - - totals <- x[[census.geo]][, match(c(geo_id_names, unlist(vars_)), all_names)] - - totals$r_whi <- rowSums(totals[, vars_[["r_whi"]], drop = FALSE]) # White population - totals$r_bla <- rowSums(totals[, vars_[["r_bla"]], drop = FALSE]) # Black population - totals$r_his <- rowSums(totals[, vars_[["r_his"]], drop = FALSE]) # Latino population - totals$r_asi <- rowSums(totals[, vars_[["r_asi"]], drop = FALSE]) # Asian + NH/PI 
population - totals$r_oth <- rowSums(totals[, vars_[["r_oth"]], drop = FALSE]) # AI/AN + Other + Mixed population - - totals <- totals[, -match(unlist(vars_), names(totals))] - totals + tmp <- x[[census.geo]][, c(geo_id_names, grep("P00|P2_0", all_names, value = TRUE))] + tmp$r_whi <- tmp[, vars_["pop_white"]] + tmp$r_bla <- tmp[, vars_["pop_black"]] + tmp$r_his <- tmp[, vars_["pop_hisp"]] + tmp$r_asi <- (tmp[, vars_["pop_asian"]] + tmp[, vars_["pop_nhpi"]]) + tmp$r_oth <- (tmp[, vars_["pop_aian"]] + tmp[, vars_["pop_other"]] + tmp[, vars_["pop_two"]]) + all_names <- names(tmp) + ## Totals + tmp_la <- tmp[, c(geo_id_names, grep("^r_", all_names, value = TRUE))] + return(list(tots = tmp_la)) } ) - N_rg <- dplyr::bind_rows(N_rg) + N_rg <- do.call(rbind, lapply(tmp_tabs, function(x) x$tots)) N_rg_geo <- do.call(paste, N_rg[, geo_id_names]) ## Subset to geo's in vf N_rg <- N_rg[N_rg_geo %in% geo_id, ] @@ -608,7 +610,7 @@ predict_race_me <- function( surname = last_c, first = first_c, middle = mid_c) - kw_names <- toupper(dplyr::pull(ntab, 1)) + kw_names <- toupper(ntab[, 1]) proc_names_vf <- .name_preproc(voter.file[[ntype]], c(kw_names)) u_vf_names <- unique(proc_names_vf) kw_in_vf <- kw_names %in% proc_names_vf diff --git a/R/staticimports.R b/R/staticimports.R deleted file mode 100644 index dbb1efb..0000000 --- a/R/staticimports.R +++ /dev/null @@ -1,57 +0,0 @@ -# Generated by staticimports; do not edit by hand. -# ====================================================================== -# Imported from pkg:stringstatic -# ====================================================================== - -#' Duplicate and concatenate strings within a character vector -#' -#' Dependency-free drop-in alternative for `stringr::str_pad()`. 
-#' -#' @author Eli Pousson \email{eli.pousson@gmail.com} -#' ([ORCID](https://orcid.org/0000-0001-8280-1706)) -#' -#' Alexander Rossell Hayes \email{alexander@rossellhayes.com} -#' ([ORCID](https://orcid.org/0000-0001-9412-0457)) -#' -#' @source Adapted from the [stringr](https://stringr.tidyverse.org/) package. -#' -#' @param string Input vector. -#' Either a character vector, or something coercible to one. -#' @param width Minimum width of padded strings. -#' @param side Side on which padding character is added (left, right or both). -#' @param pad Single padding character (default is a space). -#' @param use_width If `FALSE`, -#' use the length of the string instead of the width; -#' see [str_width()]/[str_length()] for the difference. -#' -#' @return A character vector. -#' @noRd -str_pad <- function( - string, width, side = c("left", "right", "both"), pad = " ", use_width = TRUE -) { - if (!is.numeric(width)) { - return(string[NA]) - } - - if (any(nchar(pad, type = "width") != 1)) { - stop("each string in `pad` should consist of code points of total width 1") - } - - side <- match.arg(side) - - nchar_type <- if (isTRUE(use_width)) "width" else "chars" - string_width <- nchar(string, nchar_type) - pad_width <- width - string_width - pad_width[pad_width < 0] <- 0 - - switch( - side, - "left" = paste0(strrep(pad, pad_width), string), - "right" = paste0(string, strrep(pad, pad_width)), - "both" = paste0( - strrep(pad, floor(pad_width / 2)), - string, - strrep(pad, ceiling(pad_width / 2)) - ) - ) -} diff --git a/R/utils_assert.R b/R/utils_assert.R deleted file mode 100644 index 8d9e157..0000000 --- a/R/utils_assert.R +++ /dev/null @@ -1,37 +0,0 @@ -assert_boolean <- function( - x, - argument_name = rlang::caller_arg(x), - call = rlang::caller_call() -) { - if (length(x) != 1) { - cli::cli_abort( - c( - "{.arg {argument_name}} must be a {.code TRUE} or {.code FALSE} value of length {.val {1}}.", - x = "{.arg {argument_name}} has a length of {.val {length(x)}}." 
- ), - call = call - ) - } - - if (!inherits(x, "logical")) { - cli::cli_abort( - c( - "{.arg {argument_name}} must be a {.class logical} {.code TRUE} or {.code FALSE} value.", - x = "{.arg {argument_name}} is an object of class {.cls {class(x)}}." - ), - call = call - ) - } - - if (!x %in% c(TRUE, FALSE)) { - cli::cli_abort( - c( - "{.arg {argument_name}} must be {.code TRUE} or {.code FALSE}.", - x = "{.arg {argument_name}} is {.val {x}}." - ), - call = call - ) - } - - x -} diff --git a/R/utils_determine_geo_id_names.R b/R/utils_determine_geo_id_names.R deleted file mode 100644 index 4df154a..0000000 --- a/R/utils_determine_geo_id_names.R +++ /dev/null @@ -1,10 +0,0 @@ -determine_geo_id_names <- function(census.geo) { - switch( - census.geo, - "tract" = c("county", "tract"), - "block_group" = c("county", "tract", "block_group"), - "block" = c("county", "tract", "block"), - # Return `census.geo` unchanged for county, place, and zcta - census.geo - ) -} diff --git a/R/utils_state_fips.R b/R/utils_state_fips.R deleted file mode 100644 index 2d00150..0000000 --- a/R/utils_state_fips.R +++ /dev/null @@ -1,78 +0,0 @@ -#' Dataset with FIPS codes for US states -#' -#' Dataset including FIPS codes and postal abbreviations for each U.S. state, -#' district, and territory. -#' -#' @format -#' A tibble with 57 rows and 3 columns: -#' \describe{ -#' \item{`state`}{Two-letter postal abbreviation} -#' \item{`state_code`}{Two-digit FIPS code} -#' \item{`state_name`}{English name} -#' } -#' @source Derived from [tidycensus::fips_codes()] -"state_fips" - -#' Convert between state names, postal abbreviations, and FIPS codes -#' -#' @param x A [numeric] or [character] vector of state names, -#' postal abbreviations, or FIPS codes. -#' Matches for state names and abbreviations are not case sensitive. -#' FIPS codes may be matched from numeric or character vectors, -#' with or without leading zeroes. 
-#' -#' @return -#' \describe{ -#' \item{`as_state_fips_code()`}{ -#' A [character] vector of two-digit FIPS codes. -#' One-digit FIPS codes are prefixed with a leading zero, -#' e.g., `"06"` for California. -#' } -#' \item{`as_state_abbreviation()`}{ -#' A [character] vector of two-letter postal abbreviations, -#' e.g., `"CA"` for California. -#' } -#' } -#' -#' @examples -#' as_fips_code("california") -#' as_state_abbreviation("california") -#' -#' # Character vector matches ignore case -#' as_fips_code(c("DC", "Md", "va")) -#' as_state_abbreviation(c("district of columbia", "Maryland", "VIRGINIA")) -#' -#' # Note that `3` and `7` are standardized to `NA`, -#' # because no state is assigned those FIPS codes -#' as_fips_code(1:10) -#' as_state_abbreviation(1:10) -#' -#' # You can even mix methods in the same vector -#' as_fips_code(c("utah", "NM", 8, "04")) -#' as_state_abbreviation(c("utah", "NM", 8, "04")) -#' -#' @keywords internal -#' @export -as_fips_code <- function(x) { - state_fips <- wru::state_fips - state_fips$state_code[ - dplyr::coalesce( - match(toupper(x), state_fips$state), - match(tolower(x), tolower(state_fips$state_name)), - match(suppressWarnings(as.numeric(x)), as.numeric(state_fips$state_code)) - ) - ] -} - -#' @rdname as_fips_code -#' @export -as_state_abbreviation <- function(x) { - state_fips <- wru::state_fips - state_fips$state[ - dplyr::coalesce( - match(toupper(x), state_fips$state), - match(tolower(x), tolower(state_fips$state_name)), - match(suppressWarnings(as.numeric(x)), as.numeric(state_fips$state_code)) - ) - ] -} \ No newline at end of file diff --git a/R/utils_validate_key.R b/R/utils_validate_key.R deleted file mode 100644 index a92d447..0000000 --- a/R/utils_validate_key.R +++ /dev/null @@ -1,48 +0,0 @@ -#' @importFrom rlang %||% -validate_key <- function( - key, - argument_name = rlang::caller_arg(key), - call = rlang::caller_call() -) { - key <- key %||% Sys.getenv("CENSUS_API_KEY") - - if (length(key) != 1) { - 
cli::cli_abort( - c( - "{.arg {argument_name}} must be a {.cls character} string of length {.val {1}}.", - x = "{.arg {argument_name}} has a length of {.val {length(key)}}." - ), - call = call - ) - } - - if (!inherits(key, "character")) { - cli::cli_abort( - c( - "{.arg {argument_name}} must be a {.cls character} string of length {.val {1}}.", - x = "{.arg {argument_name}} is an object of class {.cls {class(key)}}." - ), - call = call - ) - } - - if (!nzchar(key)) { - cli::cli_abort( - c( - "{.arg {argument_name}} must not be an empty string.", - "*" = "Have you set the {.envvar CENSUS_API_KEY} environment variable? - See {.help wru::get_census_data} for more information." - ), - call = call - ) - } - - if (is.na(key)) { - cli::cli_abort( - "{.arg {argument_name}} must not be {.val {NA_character_}}.", - call = call - ) - } - - key -} diff --git a/R/wru-internal.R b/R/wru-internal.R index 1025a18..209921b 100644 --- a/R/wru-internal.R +++ b/R/wru-internal.R @@ -1,10 +1,7 @@ -.onAttach <- function(libname, pkgname) { - packageStartupMessage( - "\n", - "Please cite as:", "\n\n", - format(utils::citation("wru"), style = "text"), "\n\n", - "Note that wru 2.0.0 uses 2020 census data by default.", "\n", - 'Use the argument `year = "2010"`, to replicate analyses produced with earlier package versions.', - "\n" - ) +.onAttach <- +function(libname, pkgname) { + packageStartupMessage("\nPlease cite as: \n") + packageStartupMessage("Khanna K, Bertelsen B, Olivella S, Rosenman E, Imai K (2022). 
wru: Who are You?") + packageStartupMessage("Bayesian Prediction of Racial Category Using Surname, First Name, Middle Name, and Geolocation.") + packageStartupMessage("URL: https://CRAN.R-project.org/package=wru \n") } diff --git a/README.Rmd b/README.Rmd deleted file mode 100644 index ccf678a..0000000 --- a/README.Rmd +++ /dev/null @@ -1,225 +0,0 @@ ---- -output: github_document ---- - - - -```{r, include = FALSE} -library(wru) -options(width = 10000) -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - fig.path = "man/figures/README-", - out.width = "100%", - message = FALSE -) - -voters <- dplyr::select(wru::voters, -precinct, -first, -last) -``` - -# wru: Who Are You? Bayesian Prediction of Racial Category Using Surname and Geolocation Package logo - -[![R-CMD-check](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml) [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version-last-release/wru)](https://cran.r-project.org/package=wru) ![CRAN downloads](http://cranlogs.r-pkg.org/badges/grand-total/wru) - -This R package implements the methods proposed in Imai, K. and Khanna, K. (2016). "[Improving Ecological Inference by Predicting Individual Ethnicity from Voter Registration Record](http://imai.princeton.edu/research/race.html)." Political Analysis, Vol. 24, No. 2 (Spring), pp. 263-272. [doi: 10.1093/pan/mpw001](https://dx.doi.org/10.1093/pan/mpw001). - -## Installation - -You can install the released version of **wru** from [CRAN](https://cran.r-project.org/package=wru) with: - -``` r -install.packages("wru") -``` - -Or you can install the development version of **wru** from [GitHub](https://github.com/kosukeimai/wru) with: - -``` r -# install.packages("pak") -pak::pkg_install("kosukeimai/wru") -``` - -## Using wru - -Here is a simple example that predicts the race/ethnicity of voters based only on their surnames. 
- -``` r -library(wru) -future::plan(future::multisession) -predict_race(voter.file = voters, surname.only = TRUE) -``` - -The above produces the following output, where the last five columns are probabilistic race/ethnicity predictions (e.g., `pred.his` is the probability of being Hispanic/Latino): - -``` - VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth - 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.045110474 0.003067623 0.0068522723 0.860411906 0.084557725 - 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.052645440 0.001334812 0.0558160072 0.719376581 0.170827160 - 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.043285692 0.008204605 0.9136195794 0.024316883 0.010573240 - 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.895405704 0.001911388 0.0337464844 0.011079323 0.057857101 - 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.006572555 0.001298962 0.0005388581 0.982365594 0.009224032 - 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.861236727 0.008212824 0.0095395642 0.011334635 0.109676251 - 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.543815322 0.344128607 0.0272403940 0.007405765 0.077409913 - 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.038939877 0.004920643 0.9318797791 0.012154125 0.012105576 - 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.330697188 0.194700665 0.4042849478 0.021379541 0.048937658 - 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.866360147 0.044429853 0.0246568086 0.010219712 0.054333479 -``` - -### Using geolocation - -In order to predict race/ethnicity based on surnames *and* geolocation, a user needs to provide a valid U.S. Census API key to access the census statistics. -You can request a U.S. Census API key from [the U.S. Census API key signup page](http://api.census.gov/data/key_signup.html). 
-Once you have an API key, you can use the package to download relevant Census geographic data on demand and condition race/ethnicity predictions on geolocation (county, tract, block, or place). - -First, you should save your census key to your `.Rprofile` or `.Renviron`. Below is an example procedure: - -``` -usethis::edit_r_environ() -# Edit the file with the following: -CENSUS_API_KEY=YourKey -# Save and close the file -# Restart your R session -``` - -The following example predicts the race/ethnicity of voters based on their surnames, census tract of residence (`census.geo = "tract"`), and party registration (`party = "PID"`). -Note that a valid API key must be stored in a `CENSUS_API_KEY` environment variable or provided with the `census.key` argument in order for the function to download the relevant tract-level data. - -``` r -library(wru) -predict_race(voter.file = voters, census.geo = "tract", party = "PID") -``` -``` - VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth - 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.021711601 0.0009552652 2.826779e-03 0.93364592 0.040860431 - 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.015364583 0.0002320815 9.020240e-03 0.90245186 0.072931231 - 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.092415538 0.0047099965 7.860806e-01 0.09924761 0.017546300 - 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.854810748 0.0010870744 1.783931e-02 0.04546436 0.080798514 - 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.001548762 0.0001823506 7.031116e-05 0.99501901 0.003179566 - 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.852374629 0.0052590592 8.092435e-03 0.02529163 0.108982246 - 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.831282563 0.0613242553 1.059715e-02 0.01602557 0.080770461 - 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.062022518 0.0046691402 8.218906e-01 0.08321206 0.028205698 - 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 
0.396500218 0.1390722877 2.684107e-01 0.11018413 0.085832686 - 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.861168219 0.0498449102 1.131154e-02 0.01633532 0.061340015 -``` - -In `predict_race()`, the `census.geo` options are "county", "tract", "block" and "place". -Here is an example of prediction based on census statistics collected at the level of "place": - -``` r -predict_race(voter.file = voters, census.geo = "place", party = "PID") -``` -``` - VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth - 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.042146148 0.0620484276 9.502254e-02 0.55109761 0.249685278 - 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.018140322 0.0002204255 1.026018e-02 0.90710894 0.064270133 - 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.015528660 0.0092292671 9.266893e-01 0.04182290 0.006729825 - 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.879537890 0.0008997896 1.768379e-02 0.03982601 0.062052518 - 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.001819394 0.0001723242 7.957542e-05 0.99514078 0.002787926 - 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.834942701 0.0038157857 4.933723e-03 0.04021245 0.116095337 - 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.290386744 0.5761904554 4.112613e-02 0.01895885 0.073337820 - 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.065321588 0.0039558641 8.339387e-01 0.07461133 0.022172551 - 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.428723819 0.1209683869 2.796062e-01 0.10142953 0.069272098 - 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.716211008 0.1899554127 1.867133e-02 0.01025241 0.064909839 -``` - -### Downloading census data - -It is also possible to pre-download Census geographic data, which can save time when running `predict_race()`. -The example dataset `voters` includes people in DC, NJ, and NY. 
-The following example subsets voters in DC and NJ, and then uses `get_census_data()` to download census geographic data in these two states (a valid API key must be stored in a `CENSUS_API_KEY` environment variable or provided with the `key` argument). -Census data is assigned to an object named `census.dc.nj`. -The `predict_race()` statement predicts the race/ethnicity of voters in DC and NJ using the pre-downloaded census data (`census.data = census.dc.nj`). This example conditions race/ethnicity predictions on voters' surnames, block of residence (`census.geo = "block"`), age (`age = TRUE`), and party registration (`party = "PID"`). - -Please note that the input parameters `age` and `sex` must have the same values in `get_census_data()` and `predict_race()`, i.e., `TRUE` in both or `FALSE` in both. -In this case, predictions are conditioned on age but not sex, so `age = TRUE` and `sex = FALSE` in both the `get_census_data()` and `predict_race()` statements. - -``` r -library(wru) -voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ] -census.dc.nj <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE) -predict_race(voter.file = voters.dc.nj, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID") -``` - -This produces the same result as the following statement, which downloads census data during evaluation rather than using pre-downloaded data: - -``` r -predict_race(voter.file = voters.dc.nj, census.geo = "block", age = TRUE, sex = FALSE, party = "PID") -``` - -Using pre-downloaded Census data may be useful for the following reasons: - -- You can save a lot of time in future runs of `predict_race()` if the relevant census data has already been saved; -- The machines used to run `predict_race()` may not have internet access; -- You can obtain timely snapshots of census geographic data that match your voter file. 
- -Downloading data using `get_census_data()` may take a long time, especially in large states or when using small geographic levels. -If block-level census data is not required, downloading census data at the tract level will save time. -Similarly, if tract-level data is not required, county-level data may be specified in order to save time. - -``` r -library(wru) -voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ] -census.dc.nj2 <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE, census.geo = "tract") -predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, party = "PID", age = TRUE, sex = FALSE) -predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, County) -predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract) -predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj2, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, County, Party) -predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) -``` - -#### Interact directly with the Census API - -You can use `census_geo_api()` to manually construct a census object. -The example below creates a census object with county-level and tract-level data in DC and NJ, while avoiding downloading block-level data. -Note that the `state` argument requires a vector of two-letter state abbreviations. 
- -``` r -census.dc.nj3 = list() - -county.dc <- census_geo_api(state = "DC", geo = "county", age = TRUE, sex = FALSE) -tract.dc <- census_geo_api(state = "DC", geo = "tract", age = TRUE, sex = FALSE) -census.dc.nj3[["DC"]] <- list(state = "DC", county = county.dc, tract = tract.dc, age = TRUE, sex = FALSE) - -tract.nj <- census_geo_api(state = "NJ", geo = "tract", age = TRUE, sex = FALSE) -county.nj <- census_geo_api(state = "NJ", geo = "county", age = TRUE, sex = FALSE) -census.dc.nj3[["NJ"]] <- list(state = "NJ", county = county.nj, tract = tract.nj, age = TRUE, sex = FALSE) -``` - -Note: The age and sex parameters must be consistent when creating the Census object and using that Census object in the predict_race function. If one of these parameters is TRUE in the Census object, it must also be TRUE in the predict_race function. - -After saving the data in censusObj2 above, we can condition race/ethnicity predictions on different combinations of input variables, without having to re-download the relevant Census data. - -``` r -predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj3, age = TRUE, sex = FALSE) # Pr(Race | Surname, County) -predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj3, age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract) -predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj3, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, County, Party) -predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj3, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) -``` - -### Parallelization - -For larger scale imputations, garbage collection can become a problem and your machine(s) can quickly run out of memory (RAM). -We recommended using the `future.callr::callr` plan instead of `future::multisession`. 
-The `callr` plan instantiates a new session at every iteration of your parallel loop or map. -Although this has the negative effect of creating more overhead, it also clears sticky memory elements that can grow to eventual system failure when using `multisession`. -You end up with a process that is more stable, but slightly slower. - -``` r -library(wru) -future::plan(future.callr::callr) -# ... -``` - -```{comment} -Add guidance for starting a new analysis vs. replicating a previous analysis -``` - -## Census Data - -This package uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau. - -U.S. Census Bureau (2021, October 8). Decennial Census API. Census.gov. Retrieved from - -## A related song - -[![Thumbnail of the music video for "Who Are You" by The Who](https://img.youtube.com/vi/PNbBDrceCy8/maxresdefault.jpg)](https://www.youtube.com/watch?v=PNbBDrceCy8) diff --git a/README.md b/README.md index 45ddaa3..18d298f 100644 --- a/README.md +++ b/README.md @@ -1,183 +1,108 @@ +# wru: Who Are You? Bayesian Prediction of Racial Category Using Surname and Geolocation [![R-CMD-check](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml) [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version-last-release/wru)](https://cran.r-project.org/package=wru) ![CRAN downloads](http://cranlogs.r-pkg.org/badges/grand-total/wru) - +This R package implements the methods proposed in Imai, K. and Khanna, K. (2016). "[Improving Ecological Inference by Predicting Individual Ethnicity from Voter Registration Record.](http://imai.princeton.edu/research/race.html)" Political Analysis, Vol. 24, No. 2 (Spring), pp. 263-272. doi: 10.1093/pan/mpw001. -# wru: Who Are You? 
Bayesian Prediction of Racial Category Using Surname and Geolocation Package logo +### Using wru -[![R-CMD-check](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml) -[![CRAN_Status_Badge](https://www.r-pkg.org/badges/version-last-release/wru)](https://cran.r-project.org/package=wru) -![CRAN downloads](http://cranlogs.r-pkg.org/badges/grand-total/wru) +First, you should save your census key to your `.Rprofile` or `.Renviron`. Below is an example procedure: -This R package implements the methods proposed in Imai, K. and Khanna, -K. (2016). “[Improving Ecological Inference by Predicting Individual -Ethnicity from Voter Registration -Record](http://imai.princeton.edu/research/race.html).” Political -Analysis, Vol. 24, No. 2 (Spring), pp. 263-272. [doi: -10.1093/pan/mpw001](https://dx.doi.org/10.1093/pan/mpw001). + > usethis::edit_r_profile() + # edit the file with the following + Sys.setenv("CENSUS_API_KEY" = "Your Key") + # save and close the file + # Restart your R session -## Installation +Now, here is a simple example that predicts the race/ethnicity of voters based only on their surnames. -You can install the released version of **wru** from -[CRAN](https://cran.r-project.org/package=wru) with: +Note: Sample data `voters` is based on 2010 Census, therefore we fix that variable here. Default +is the latest Census release year (2020). See `get_census_data()` to retrieve Census data related to your +data set. If you are getting an error similar to setting `year = 2020`, this is what needs to be +addressed. 
``` r -install.packages("wru") +library(wru) +future::plan(future::multisession) +data(voters) +predict_race(voter.file = voters, surname.only = T, year = 2010) ``` -Or you can install the development version of **wru** from -[GitHub](https://github.com/kosukeimai/wru) with: +The above produces the following output, where the last five columns are probabilistic race/ethnicity predictions (e.g., 'pred.his' is the probability of being Hispanic/Latino): -``` r -# install.packages("pak") -pak::pkg_install("kosukeimai/wru") -``` + "Proceeding with surname-only predictions ..." + VoterID surname state CD county tract block precinct age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth + 1 Khanna NJ 12 021 004000 3001 6 29 0 Ind 0 74000 0.0676000 0.00430000 0.00820000 0.86680000 0.05310000 + 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.0812000 0.00240000 0.06890000 0.73750000 0.11000000 + 3 Velasco NY 12 061 004800 6001 33 0 Rep 2 51000 0.0594000 0.00260000 0.82270000 0.10510000 0.01020000 + 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.9355936 0.00220022 0.02850285 0.00780078 0.02590259 + 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.0098000 0.00180000 0.00065000 0.98200000 0.00575000 + 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.9187000 0.01083333 0.01083333 0.01083333 0.04880000 + 7 Johnson NY 9 061 015100 4000 25 0 Dem 1 51000 0.5897000 0.34630000 0.02360000 0.00540000 0.03500000 + 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.0486000 0.00570000 0.92920000 0.01020000 0.00630000 + 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.6665000 0.08530000 0.13670000 0.07970000 0.03180000 + 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.9054000 0.04310000 0.02060000 0.00720000 0.02370000 -## Using wru +In order to predict race/ethnicity based on surnames AND geolocation, a user needs to provide a valid U.S. Census API key to access the census statistics. You may request a U.S. 
Census API key [here](http://api.census.gov/data/key_signup.html). Once you have an API key, you can use the package to download relevant Census geographic data on demand and condition race/ethnicity predictions on geolocation (county, tract, block, or place). -Here is a simple example that predicts the race/ethnicity of voters -based only on their surnames. +The following example predicts the race/ethnicity of voters based on their surnames, Census tract of residence (census.geo = "tract"), and which party registration (party = "PID"). Note that a valid API key must be provided in the input parameter 'census.key' in order for the function to download the relevant tract-level data. ``` r library(wru) -future::plan(future::multisession) -predict_race(voter.file = voters, surname.only = TRUE) +data(voters) +predict_race(voter.file = voters, year = 2010, census.geo = "tract", census.key = "...", party = "PID") ``` -The above produces the following output, where the last five columns are -probabilistic race/ethnicity predictions (e.g., `pred.his` is the -probability of being Hispanic/Latino): - - VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth - 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.045110474 0.003067623 0.0068522723 0.860411906 0.084557725 - 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.052645440 0.001334812 0.0558160072 0.719376581 0.170827160 - 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.043285692 0.008204605 0.9136195794 0.024316883 0.010573240 - 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.895405704 0.001911388 0.0337464844 0.011079323 0.057857101 - 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.006572555 0.001298962 0.0005388581 0.982365594 0.009224032 - 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.861236727 0.008212824 0.0095395642 0.011334635 0.109676251 - 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.543815322 0.344128607 0.0272403940 0.007405765 0.077409913 
- 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.038939877 0.004920643 0.9318797791 0.012154125 0.012105576 - 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.330697188 0.194700665 0.4042849478 0.021379541 0.048937658 - 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.866360147 0.044429853 0.0246568086 0.010219712 0.054333479 - -### Using geolocation - -In order to predict race/ethnicity based on surnames *and* geolocation, -a user needs to provide a valid U.S. Census API key to access the census -statistics. You can request a U.S. Census API key from [the U.S. Census -API key signup page](http://api.census.gov/data/key_signup.html). Once -you have an API key, you can use the package to download relevant Census -geographic data on demand and condition race/ethnicity predictions on -geolocation (county, tract, block, or place). - -First, you should save your census key to your `.Rprofile` or -`.Renviron`. Below is an example procedure: - - usethis::edit_r_environ() - # Edit the file with the following: - CENSUS_API_KEY=YourKey - # Save and close the file - # Restart your R session - -The following example predicts the race/ethnicity of voters based on -their surnames, census tract of residence (`census.geo = "tract"`), and -party registration (`party = "PID"`). Note that a valid API key must be -stored in a `CENSUS_API_KEY` environment variable or provided with the -`census.key` argument in order for the function to download the relevant -tract-level data. +The above returns the following output. 
-``` r -library(wru) -predict_race(voter.file = voters, census.geo = "tract", party = "PID") -``` + VoterID surname state CD county tract block precinct age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth + 1 Khanna NJ 12 021 004000 3001 6 29 0 Ind 0 74000 0.081856291 0.0021396565 0.0110451405 0.828313291 0.076645621 + 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.916936771 0.0044432219 0.0120276229 0.008532929 0.058059455 + 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.895620643 0.0022078678 0.0139457411 0.023345853 0.064879895 + 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.003164229 0.0006092345 0.0001072684 0.991261466 0.004857802 + 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.029936354 0.0009275220 0.0129831039 0.850040743 0.106112277 + 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.231046860 0.0016485574 0.6813780115 0.053180270 0.032746301 + 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.817841573 0.0063677130 0.0258733496 0.107254103 0.042663261 + 3 Velasco NY 12 061 004800 6001 33 0 Rep 2 51000 0.223924118 0.0002913000 0.4451163607 0.313431417 0.017236805 + 7 Johnson NY 9 061 015100 4000 25 0 Dem 1 51000 0.241417483 0.6900686166 0.0293556870 0.011105140 0.028053073 + 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.983300770 0.0006116706 0.0034070782 0.004823439 0.007857042 - VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth - 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.021711601 0.0009552652 2.826779e-03 0.93364592 0.040860431 - 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.015364583 0.0002320815 9.020240e-03 0.90245186 0.072931231 - 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.092415538 0.0047099965 7.860806e-01 0.09924761 0.017546300 - 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.854810748 0.0010870744 1.783931e-02 0.04546436 0.080798514 - 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.001548762 0.0001823506 7.031116e-05 
0.99501901 0.003179566 - 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.852374629 0.0052590592 8.092435e-03 0.02529163 0.108982246 - 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.831282563 0.0613242553 1.059715e-02 0.01602557 0.080770461 - 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.062022518 0.0046691402 8.218906e-01 0.08321206 0.028205698 - 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.396500218 0.1390722877 2.684107e-01 0.11018413 0.085832686 - 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.861168219 0.0498449102 1.131154e-02 0.01633532 0.061340015 - -In `predict_race()`, the `census.geo` options are “county”, “tract”, -“block” and “place”. Here is an example of prediction based on census -statistics collected at the level of “place”: +In predict_race, the census.geo options are "county", "tract", "block" and "place". Here is an example of prediction based on census statistics collected at the level of "place": ``` r -predict_race(voter.file = voters, census.geo = "place", party = "PID") +data(voters) +predict_race(voter.file = voters, census.geo = "place", year = 2010, census.key = "...", party = "PID") ``` - VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth - 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.042146148 0.0620484276 9.502254e-02 0.55109761 0.249685278 - 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.018140322 0.0002204255 1.026018e-02 0.90710894 0.064270133 - 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.015528660 0.0092292671 9.266893e-01 0.04182290 0.006729825 - 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.879537890 0.0008997896 1.768379e-02 0.03982601 0.062052518 - 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.001819394 0.0001723242 7.957542e-05 0.99514078 0.002787926 - 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.834942701 0.0038157857 4.933723e-03 0.04021245 0.116095337 - 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.290386744 
0.5761904554 4.112613e-02 0.01895885 0.073337820 - 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.065321588 0.0039558641 8.339387e-01 0.07461133 0.022172551 - 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.428723819 0.1209683869 2.796062e-01 0.10142953 0.069272098 - 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.716211008 0.1899554127 1.867133e-02 0.01025241 0.064909839 - -### Downloading census data - -It is also possible to pre-download Census geographic data, which can -save time when running `predict_race()`. The example dataset `voters` -includes people in DC, NJ, and NY. The following example subsets voters -in DC and NJ, and then uses `get_census_data()` to download census -geographic data in these two states (a valid API key must be stored in a -`CENSUS_API_KEY` environment variable or provided with the `key` -argument). Census data is assigned to an object named `census.dc.nj`. -The `predict_race()` statement predicts the race/ethnicity of voters in -DC and NJ using the pre-downloaded census data -(`census.data = census.dc.nj`). This example conditions race/ethnicity -predictions on voters’ surnames, block of residence -(`census.geo = "block"`), age (`age = TRUE`), and party registration -(`party = "PID"`). - -Please note that the input parameters `age` and `sex` must have the same -values in `get_census_data()` and `predict_race()`, i.e., `TRUE` in both -or `FALSE` in both. In this case, predictions are conditioned on age but -not sex, so `age = TRUE` and `sex = FALSE` in both the -`get_census_data()` and `predict_race()` statements. +It is also possible to pre-download Census geographic data, which can save time when running predict_race(). The example dataset 'voters' includes people in DC, NJ, and NY. The following example subsets voters in DC and NJ, and then uses get_census_data() to download Census geographic data in these two states (input parameter 'key' requires valid API key). Census data is assigned to an object named census.dc.nj. 
The predict_race() statement predicts the race/ethnicity of voters in DC and NJ using the pre-saved Census data (census.data = census.dc.nj). This example conditions race/ethnicity predictions on voters' surnames, block of residence (census.geo = "block"), age (age = TRUE), and party registration (party = "PID"). + +Please note that the input parameters 'age' and 'sex' must have the same values in get_census_data() and predict_race(), i.e., TRUE in both or FALSE in both. In this case, predictions are conditioned on age but not sex, so age = TRUE and sex = FALSE in both the get_census_data() and predict_race() statements. ``` r library(wru) -voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ] -census.dc.nj <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE) +data(voters) +voters.dc.nj <- voters[c(-3, -7), ] # remove two NY cases from dataset +census.dc.nj <- get_census_data(key = "...", state = c("DC", "NJ"), age = TRUE, sex = FALSE) # create Census data object covering DC and NJ predict_race(voter.file = voters.dc.nj, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID") ``` -This produces the same result as the following statement, which -downloads census data during evaluation rather than using pre-downloaded -data: +The last two lines above are equivalent to the following: ``` r -predict_race(voter.file = voters.dc.nj, census.geo = "block", age = TRUE, sex = FALSE, party = "PID") +predict_race(voter.file = voters.dc.nj, census.geo = "block", year = 2010, census.key = "...", age = TRUE, sex = FALSE, party = "PID") ``` -Using pre-downloaded Census data may be useful for the following -reasons: +Using pre-downloaded Census data may be useful for the following reasons: -- You can save a lot of time in future runs of `predict_race()` if the - relevant census data has already been saved; -- The machines used to run `predict_race()` may not have internet - access; -- You can obtain timely snapshots of census 
geographic data that match - your voter file. +- You can save a lot of time in future runs of predict_race() if the relevant Census data has already been saved; +- The machines used to run predict_race() may not have internet access; +- You can obtain timely snapshots of Census geographic data that match your voter file. -Downloading data using `get_census_data()` may take a long time, -especially in large states or when using small geographic levels. If -block-level census data is not required, downloading census data at the -tract level will save time. Similarly, if tract-level data is not -required, county-level data may be specified in order to save time. +Downloading data using get_census_data() may take a long time, especially at the block level or in large states. If block-level Census data is not required, downloading Census data at the tract level will save time. Similarly, if tract-level Census data is not required, county-level data may be specified in order to save time. ``` r library(wru) -voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ] -census.dc.nj2 <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE, census.geo = "tract") +data(voters) +voters.dc.nj <- voters[c(-3, -7), ] # remove two NY cases from dataset +census.dc.nj2 <- get_census_data(key = "...", state = c("DC", "NJ"), age = TRUE, sex = FALSE, census.geo = "tract") predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, party = "PID", age = TRUE, sex = FALSE) predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, County) predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract) @@ -185,69 +110,45 @@ predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = cen predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, 
party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) ``` -#### Interact directly with the Census API - -You can use `census_geo_api()` to manually construct a census object. -The example below creates a census object with county-level and -tract-level data in DC and NJ, while avoiding downloading block-level -data. Note that the `state` argument requires a vector of two-letter -state abbreviations. +Or you can also use the census_geo_api() to manually construct a census object. The example below creates a census object with county-level and tract-level data in DC and NJ, while avoiding downloading block-level data. Note that this function has the input parameter 'state' that requires a two-letter state abbreviation to proceed. ``` r -census.dc.nj3 = list() +censusObj2 = list() -county.dc <- census_geo_api(state = "DC", geo = "county", age = TRUE, sex = FALSE) -tract.dc <- census_geo_api(state = "DC", geo = "tract", age = TRUE, sex = FALSE) -census.dc.nj3[["DC"]] <- list(state = "DC", county = county.dc, tract = tract.dc, age = TRUE, sex = FALSE) +county.dc <- census_geo_api(key = "...", state = "DC", geo = "county", age = TRUE, sex = FALSE) +tract.dc <- census_geo_api(key = "...", state = "DC", geo = "tract", age = TRUE, sex = FALSE) +censusObj2[["DC"]] <- list(state = "DC", county = county.dc, tract = tract.dc, age = TRUE, sex = FALSE) -tract.nj <- census_geo_api(state = "NJ", geo = "tract", age = TRUE, sex = FALSE) -county.nj <- census_geo_api(state = "NJ", geo = "county", age = TRUE, sex = FALSE) -census.dc.nj3[["NJ"]] <- list(state = "NJ", county = county.nj, tract = tract.nj, age = TRUE, sex = FALSE) +tract.nj <- census_geo_api(key = "...", state = "NJ", geo = "tract", age = TRUE, sex = FALSE) +county.nj <- census_geo_api(key = "...", state = "NJ", geo = "county", age = TRUE, sex = FALSE) +censusObj2[["NJ"]] <- list(state = "NJ", county = county.nj, tract = tract.nj, age = TRUE, sex = FALSE) ``` -Note: The age and sex parameters must be 
consistent when creating the -Census object and using that Census object in the predict_race function. -If one of these parameters is TRUE in the Census object, it must also be -TRUE in the predict_race function. +Note: The age and sex parameters must be consistent when creating the Census object and using that Census object in the predict_race function. If one of these parameters is TRUE in the Census object, it must also be TRUE in the predict_race function. -After saving the data in censusObj2 above, we can condition -race/ethnicity predictions on different combinations of input variables, -without having to re-download the relevant Census data. +After saving the data in censusObj2 above, we can condition race/ethnicity predictions on different combinations of input variables, without having to re-download the relevant Census data. ``` r -predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj3, age = TRUE, sex = FALSE) # Pr(Race | Surname, County) -predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj3, age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract) -predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj3, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, County, Party) -predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj3, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) +predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = censusObj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, County) +predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = censusObj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract) +predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = censusObj2, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, County, Party) +predict_race(voter.file = voters.dc.nj, census.geo = "tract", 
census.data = censusObj2, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) ``` -### Parallelization +## Notes about process design -For larger scale imputations, garbage collection can become a problem -and your machine(s) can quickly run out of memory (RAM). We recommended -using the `future.callr::callr` plan instead of `future::multisession`. -The `callr` plan instantiates a new session at every iteration of your -parallel loop or map. Although this has the negative effect of creating -more overhead, it also clears sticky memory elements that can grow to -eventual system failure when using `multisession`. You end up with a -process that is more stable, but slightly slower. +For larger scale imputations garbage-collection can become a problem and your machine(s) can quickly run out of memory (RAM). It is recommended to use the `future.callr::callr` plan instead of `future::multisession`. The `callr` plan instantiates a new session at every iteration of your parallel loop or map. This simultaneously has the effect of creating more overhead, but also clearing the often sticky memory elements that would be left over to grow to eventual system failure when using `multisession`. You end up with a process that is more stable, but slightly slower. -``` r -library(wru) -future::plan(future.callr::callr) -# ... -``` + library(wru) + future::plan(future.callr::callr) + # ... ## Census Data -This package uses the Census Bureau Data API but is not endorsed or -certified by the Census Bureau. +This package uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau. -U.S. Census Bureau (2021, October 8). Decennial Census API. Census.gov. -Retrieved from - +U.S. Census Bureau (2021, October 8). Decennial Census API. Census.gov. 
Retrieved from -## A related song +### A related song -[![Thumbnail of the music video for “Who Are You” by The -Who](https://img.youtube.com/vi/PNbBDrceCy8/maxresdefault.jpg)](https://www.youtube.com/watch?v=PNbBDrceCy8) +Watch [this](https://www.youtube.com/watch?v=LYb_nqU_43w)! diff --git a/cran-comments.md b/cran-comments.md index 25d6f18..858617d 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -3,4 +3,3 @@ 0 errors | 0 warnings | 1 note * This is a new release. -* Note is related to file size which cannot be reduced. diff --git a/data-raw/state_fips.R b/data-raw/state_fips.R deleted file mode 100644 index ff2fb7c..0000000 --- a/data-raw/state_fips.R +++ /dev/null @@ -1,10 +0,0 @@ -library(dplyr) -library(tibble) -library(tidycensus) - -state_fips <- tidycensus::fips_codes |> - tibble::as_tibble() |> - dplyr::distinct(state, state_code, state_name) |> - tibble::remove_rownames() - -usethis::use_data(state_fips, overwrite = TRUE) diff --git a/data-raw/voters.R b/data-raw/voters.R deleted file mode 100644 index a224447..0000000 --- a/data-raw/voters.R +++ /dev/null @@ -1,19 +0,0 @@ -voters <- data.frame( - VoterID = as.character(1:10), - surname = c("Khanna", "Imai", "Rivera", "Fifield", "Zhou", "Ratkovic", "Johnson", "Lopez", "Wantchekon", "Morse"), - state = c("NJ", "NJ", "NY", "NJ", "NJ", "NJ", "NY", "NJ", "NJ", "DC"), - CD = c("12", "12", "12", "12", "12", "12", "9", "12", "12", "0"), - county = c("021", "021", "061", "021", "021", "021", "061", "021", "021", "001"), - tract = c("004000", "004501", "004800", "004501", "004501", "004000", "014900", "004501", "004501", "001301"), - block = c("3001", "1025", "6001", "1025", "1025", "1025", "4000", "1025", "1025", "3005"), - precinct = c("6", "", "", "", "", "", "", "", "", ""), - age = c(29, 40, 33, 27, 28, 35, 25, 33, 50, 29), - sex = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1), - party = c("Ind", "Dem", "Rep", "Dem", "Dem", "Ind", "Dem", "Rep", "Rep", "Rep"), - PID = c("0", "1", "2", "1", "1", "0", "1", "2", "2", 
"2"), - place = c("74000", "60900", "51000", "60900", "60900", "60900", "51000", "60900", "60900", "50000"), - last = c("Khanna", "Imai", "Rivera", "Fifield", "Zhou", "Ratkovic", "Johnson", "Lopez", "Wantchekon", "Morse"), - first = c("Kabir", "Kosuke", "Carlos", "Ben", "Yang-Yang", "Marc", "Frank", "Gabriel", "Leonard", "Julia") -) - -usethis::use_data(voters, overwrite = TRUE) diff --git a/data/state_fips.rda b/data/state_fips.rda deleted file mode 100644 index 5ee0070..0000000 Binary files a/data/state_fips.rda and /dev/null differ diff --git a/data/voters.RData b/data/voters.RData new file mode 100644 index 0000000..ffe9973 Binary files /dev/null and b/data/voters.RData differ diff --git a/data/voters.rda b/data/voters.rda deleted file mode 100644 index bfa686b..0000000 Binary files a/data/voters.rda and /dev/null differ diff --git a/inst/CITATION b/inst/CITATION deleted file mode 100644 index 159fa69..0000000 --- a/inst/CITATION +++ /dev/null @@ -1,14 +0,0 @@ -bibentry( - bibtype = "Manual", - title = "wru: Who are You? 
Bayesian Prediction of Racial Category Using Surname, First Name, Middle Name, and Geolocation", - author = c( - person("Kabir", "Khanna", , "kabirkhanna@gmail.com", role = "aut"), - person("Brandon", "Bertelsen", , "brandon@bertelsen.ca", role = c("aut", "cre")), - person("Santiago", "Olivella", , "olivella@unc.edu", role = "aut"), - person("Evan", "Rosenman", , "etrrosenman@gmail.com", role = "aut"), - person("Alexander", "Rossell Hayes", , "alexander@rossellhayes.com", role = "aut"), - person("Kosuke", "Imai", , "imai@harvard.edu", role = "aut") - ), - year = 2024, - url = "https://CRAN.R-project.org/package=wru" -) diff --git a/man/as_fips_code.Rd b/man/as_fips_code.Rd deleted file mode 100644 index 1e2c60c..0000000 --- a/man/as_fips_code.Rd +++ /dev/null @@ -1,53 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_state_fips.R -\name{as_fips_code} -\alias{as_fips_code} -\alias{as_state_abbreviation} -\title{Convert between state names, postal abbreviations, and FIPS codes} -\usage{ -as_fips_code(x) - -as_state_abbreviation(x) -} -\arguments{ -\item{x}{A \link{numeric} or \link{character} vector of state names, -postal abbreviations, or FIPS codes. -Matches for state names and abbreviations are not case sensitive. -FIPS codes may be matched from numeric or character vectors, -with or without leading zeroes.} -} -\value{ -\describe{ -\item{\code{as_state_fips_code()}}{ -A \link{character} vector of two-digit FIPS codes. -One-digit FIPS codes are prefixed with a leading zero, -e.g., \code{"06"} for California. -} -\item{\code{as_state_abbreviation()}}{ -A \link{character} vector of two-letter postal abbreviations, -e.g., \code{"CA"} for California. 
-} -} -} -\description{ -Convert between state names, postal abbreviations, and FIPS codes -} -\examples{ -as_fips_code("california") -as_state_abbreviation("california") - -# Character vector matches ignore case -as_fips_code(c("DC", "Md", "va")) -as_state_abbreviation(c("district of columbia", "Maryland", "VIRGINIA")) - -# Note that `3` and `7` are standardized to `NA`, -# because no state is assigned those FIPS codes -as_fips_code(1:10) -as_state_abbreviation(1:10) - -# You can even mix methods in the same vector -as_fips_code(c("utah", "NM", 8, "04")) -as_state_abbreviation(c("utah", "NM", 8, "04")) - -} -\keyword{internal} diff --git a/man/census_data_preflight.Rd b/man/census_data_preflight.Rd index 5b656dd..f3512c8 100644 --- a/man/census_data_preflight.Rd +++ b/man/census_data_preflight.Rd @@ -7,29 +7,11 @@ census_data_preflight(census.data, census.geo, year) } \arguments{ -\item{census.data}{A list indexed by two-letter state abbreviations, -which contains pre-saved Census geographic data. -Can be generated using \code{get_census_data} function.} +\item{census.data}{See documentation in \code{race_predict}.} -\item{census.geo}{An optional character vector specifying what level of -geography to use to merge in U.S. Census geographic data. Currently -\code{"county"}, \code{"tract"}, \code{"block_group"}, \code{"block"}, and \code{"place"} -are supported. -Note: sufficient information must be in user-defined \code{\var{voter.file}} object. -If \code{\var{census.geo} = "county"}, then \code{\var{voter.file}} -must have column named \code{county}. -If \code{\var{census.geo} = "tract"}, then \code{\var{voter.file}} -must have columns named \code{county} and \code{tract}. -And if \code{\var{census.geo} = "block"}, then \code{\var{voter.file}} -must have columns named \code{county}, \code{tract}, and \code{block}. -If \code{\var{census.geo} = "place"}, then \code{\var{voter.file}} -must have column named \code{place}. 
-If \code{census.geo = "zcta"}, then \code{voter.file} must have column named \code{zcta}. -Specifying \code{\var{census.geo}} will call \code{census_helper} function -to merge Census geographic data at specified level of geography.} +\item{census.geo}{See documentation in \code{race_predict}.} -\item{year}{An optional character vector specifying the year of U.S. Census geographic -data to be downloaded. Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}.} +\item{year}{See documentation in \code{race_predict}.} } \description{ Preflight census data diff --git a/man/census_geo_api.Rd b/man/census_geo_api.Rd index e679fa6..4c1f6ae 100644 --- a/man/census_geo_api.Rd +++ b/man/census_geo_api.Rd @@ -5,50 +5,46 @@ \title{Census Data download function.} \usage{ census_geo_api( - key = Sys.getenv("CENSUS_API_KEY"), + key = NULL, state, - geo = c("tract", "block", "block_group", "county", "place", "zcta"), + geo = "tract", age = FALSE, sex = FALSE, - year = c("2020", "2010"), + year = "2020", retry = 3, save_temp = NULL, counties = NULL ) } \arguments{ -\item{key}{A character string containing a valid Census API key, -which can be requested from the -\href{https://api.census.gov/data/key_signup.html}{U.S. Census API key signup page}. +\item{key}{A required character object. Must contain user's Census API +key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} -By default, attempts to find a census key stored in an -\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.} - -\item{state}{A required character object specifying which state to extract Census data for, +\item{state}{A required character object specifying which state to extract Census data for, e.g., \code{"NJ"}.} -\item{geo}{A character object specifying what aggregation level to use. -Use \code{"block"}, \code{"block_group"}, \code{"county"}, \code{"place"}, \code{"tract"}, or \code{"zcta"}. 
+\item{geo}{A character object specifying what aggregation level to use. +Use \code{"county"}, \code{"tract"},\code{"block_group"}, \code{"block"}, or \code{"place"}. Default is \code{"tract"}. Warning: extracting block-level data takes very long.} -\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Age | Race). +If \code{TRUE}, function will return Pr(Geolocation, Age | Race). If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} -\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). +\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). +If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} \item{year}{A character object specifying the year of U.S. Census data to be downloaded. Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}. -Warning: 2020 U.S. Census data is downloaded only when \code{\var{age}} and +Warning: 2020 U.S. Census data is downloaded only when \code{\var{age}} and \code{\var{sex}} are both \code{FALSE}.} \item{retry}{The number of retries at the census website if network interruption occurs.} -\item{save_temp}{File indicating where to save the temporary outputs. +\item{save_temp}{File indicating where to save the temporary outputs. Defaults to NULL. 
If specified, the function will look for an .RData file with the same format as the expected output.} @@ -56,26 +52,26 @@ with the same format as the expected output.} Useful for smaller predictions where only a few counties are considered. Must be zero padded.} } \value{ -Output will be an object of class \code{list}, indexed by state names. It will -consist of the original user-input data with additional columns of Census geographic data. +Output will be an object of class \code{list}, indexed by state names. It will + consist of the original user-input data with additional columns of Census geographic data. } \description{ \code{census_geo_api} retrieves U.S. Census geographic data for a given state. } \details{ -This function allows users to download U.S. Census geographic data (2010 or 2020), +This function allows users to download U.S. Census geographic data (2010 or 2020), at either the county, tract, block, or place level, for a particular state. } \examples{ \dontshow{data(voters)} -\dontrun{census_geo_api(states = c("NJ", "DE"), geo = "block")} -\dontrun{census_geo_api(states = "FL", geo = "tract", age = TRUE, sex = TRUE)} -\dontrun{census_geo_api(states = "MA", geo = "place", age = FALSE, sex = FALSE, +\dontrun{census_geo_api(key = "...", states = c("NJ", "DE"), geo = "block")} +\dontrun{census_geo_api(key = "...", states = "FL", geo = "tract", age = TRUE, sex = TRUE)} +\dontrun{census_geo_api(key = "...", states = "MA", geo = "place", age = FALSE, sex = FALSE, year = "2020")} } \references{ -Relies on \code{get_census_api()}, \code{get_census_api_2()}, and \code{vec_to_chunk()} functions authored by Nicholas Nagle, +Relies on get_census_api, get_census_api_2, and vec_to_chunk functions authored by Nicholas Nagle, available \href{https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. 
} \keyword{internal} diff --git a/man/census_geo_api_names.Rd b/man/census_geo_api_names.Rd deleted file mode 100644 index a03488c..0000000 --- a/man/census_geo_api_names.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/census_geo_api_names.R -\name{census_geo_api_names} -\alias{census_geo_api_names} -\alias{census_geo_api_url} -\title{Census geo API helper functions} -\usage{ -census_geo_api_names( - year = c("2020", "2010", "2000"), - age = FALSE, - sex = FALSE -) - -census_geo_api_url(year = c("2020", "2010", "2000")) -} -\arguments{ -\item{year}{A character object specifying the year of U.S. Census data to be downloaded. -Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}. -Warning: 2020 U.S. Census data is downloaded only when \code{\var{age}} and -\code{\var{sex}} are both \code{FALSE}.} - -\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Age | Race). -If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} - -\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). -If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} -} -\value{ -\describe{ -\item{\code{census_geo_api_names()}}{ -A named list of \link{character} vectors whose values correspond to columns -of a Census API table and whose names represent the new columns they are -used to calculate in \code{\link[=census_geo_api]{census_geo_api()}}. -} -\item{\code{census_geo_api_url()}}{ -A \link{character} string containing the base of the URL to a -Census API table. 
-} -} -} -\description{ -Census geo API helper functions -} -\keyword{internal} diff --git a/man/census_helper.Rd b/man/census_helper.Rd index 74dcb2a..ef3c5ff 100644 --- a/man/census_helper.Rd +++ b/man/census_helper.Rd @@ -5,7 +5,7 @@ \title{Census helper function.} \usage{ census_helper( - key = Sys.getenv("CENSUS_API_KEY"), + key, voter.file, states = "all", geo = "tract", @@ -18,12 +18,8 @@ census_helper( ) } \arguments{ -\item{key}{A character string containing a valid Census API key, -which can be requested from the -\href{https://api.census.gov/data/key_signup.html}{U.S. Census API key signup page}. - -By default, attempts to find a census key stored in an -\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.} +\item{key}{A required character object. Must contain user's Census API +key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} \item{voter.file}{An object of class \code{data.frame}. Must contain field(s) named \code{\var{county}}, \code{\var{tract}}, \code{\var{block}}, and/or \code{\var{place}} @@ -72,8 +68,8 @@ If \code{\var{census.data}} is missing, Census geographic data will be obtained } \value{ Output will be an object of class \code{data.frame}. It will -consist of the original user-input data with additional columns of -Census data. + consist of the original user-input data with additional columns of + Census data. } \description{ \code{census_helper} links user-input dataset with Census geographic data. @@ -89,17 +85,17 @@ Pr(Geolocation | Race) where geolocation is county, tract, block, or place. 
data(voters) } \dontrun{ -census_helper(voter.file = voters, states = "nj", geo = "block") +census_helper(key = "...", voter.file = voters, states = "nj", geo = "block") } \dontrun{ census_helper( - voter.file = voters, states = "all", geo = "tract", + key = "...", voter.file = voters, states = "all", geo = "tract", age = TRUE, sex = TRUE ) } \dontrun{ census_helper( - voter.file = voters, states = "all", geo = "county", + key = "...", voter.file = voters, states = "all", geo = "county", age = FALSE, sex = FALSE, year = "2020" ) } diff --git a/man/census_helper_new.Rd b/man/census_helper_new.Rd index afa7599..e410267 100644 --- a/man/census_helper_new.Rd +++ b/man/census_helper_new.Rd @@ -5,40 +5,35 @@ \title{Census helper function.} \usage{ census_helper_new( - key = Sys.getenv("CENSUS_API_KEY"), + key, voter.file, states = "all", - geo = c("tract", "block", "block_group", "county", "place", "zcta"), + geo = "tract", age = FALSE, sex = FALSE, year = "2020", census.data = NULL, retry = 3, - use.counties = FALSE, - skip_bad_geos = FALSE + use.counties = FALSE ) } \arguments{ -\item{key}{A character string containing a valid Census API key, -which can be requested from the -\href{https://api.census.gov/data/key_signup.html}{U.S. Census API key signup page}. +\item{key}{A required character object. Must contain user's Census API +key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} -By default, attempts to find a census key stored in an -\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.} - -\item{voter.file}{An object of class \code{data.frame}. Must contain field(s) named +\item{voter.file}{An object of class \code{data.frame}. Must contain field(s) named \code{\var{county}}, \code{\var{tract}}, \code{\var{block}}, and/or \code{\var{place}} -specifying geolocation. These should be character variables that match up with -U.S. Census categories. 
County should be three characters (e.g., "031" not "31"), -tract should be six characters, and block should be four characters. +specifying geolocation. These should be character variables that match up with +U.S. Census categories. County should be three characters (e.g., "031" not "31"), +tract should be six characters, and block should be four characters. Place should be five characters if it is included.} -\item{states}{A character vector specifying which states to extract -Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts +\item{states}{A character vector specifying which states to extract +Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts Census data for all states contained in user-input data.} -\item{geo}{A character object specifying what aggregation level to use. -Use \code{"county"}, \code{"tract"}, \code{"block"}, or \code{"place"}. +\item{geo}{A character object specifying what aggregation level to use. +Use \code{"county"}, \code{"tract"}, \code{"block"}, or \code{"place"}. Default is \code{"tract"}. Warning: extracting block-level data takes very long.} \item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on @@ -54,43 +49,39 @@ If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Ag \item{year}{A character object specifying the year of U.S. Census data to be downloaded. Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}.} -\item{census.data}{A optional census object of class \code{list} containing +\item{census.data}{A optional census object of class \code{list} containing pre-saved Census geographic data. Can be created using \code{get_census_data} function. -If \code{\var{census.data}} is provided, the \code{\var{year}} element must -have the same value as the \code{\var{year}} option specified in this function -(i.e., \code{"2010"} in both or \code{"2020"} in both). 
-If \code{\var{census.data}} is provided, the \code{\var{age}} and the \code{\var{sex}} +If \code{\var{census.data}} is provided, the \code{\var{year}} element must +have the same value as the \code{\var{year}} option specified in this function +(i.e., \code{"2010"} in both or \code{"2020"} in both). +If \code{\var{census.data}} is provided, the \code{\var{age}} and the \code{\var{sex}} elements must be \code{FALSE}. This corresponds to the defaults of \code{census_geo_api}. If \code{\var{census.data}} is missing, Census geographic data will be obtained via Census API.} \item{retry}{The number of retries at the census website if network interruption occurs.} -\item{use.counties}{A logical, defaulting to FALSE. Should census data be filtered by counties +\item{use.counties}{A logical, defaulting to FALSE. Should census data be filtered by counties available in \var{census.data}?} - -\item{skip_bad_geos}{Logical. Option to have the function skip any geolocations that are not present -in the census data, returning a partial data set. Default is set to \code{FALSE}, which case it will -break and provide error message with a list of offending geolocations.} } \value{ -Output will be an object of class \code{data.frame}. It will -consist of the original user-input data with additional columns of -Census data. +Output will be an object of class \code{data.frame}. It will + consist of the original user-input data with additional columns of + Census data. } \description{ \code{census_helper_new} links user-input dataset with Census geographic data. } \details{ -This function allows users to link their geocoded dataset (e.g., voter file) -with U.S. Census data (2010 or 2020). The function extracts Census Summary File data -at the county, tract, block, or place level. Census data calculated are +This function allows users to link their geocoded dataset (e.g., voter file) +with U.S. Census data (2010 or 2020). 
The function extracts Census Summary File data +at the county, tract, block, or place level. Census data calculated are Pr(Geolocation | Race) where geolocation is county, tract, block, or place. } \examples{ \dontshow{data(voters)} -\dontrun{census_helper_new(voter.file = voters, states = "nj", geo = "block")} -\dontrun{census_helper_new(voter.file = voters, states = "all", geo = "tract")} -\dontrun{census_helper_new(voter.file = voters, states = "all", geo = "place", +\dontrun{census_helper_new(key = "...", voter.file = voters, states = "nj", geo = "block")} +\dontrun{census_helper_new(key = "...", voter.file = voters, states = "all", geo = "tract")} +\dontrun{census_helper_new(key = "...", voter.file = voters, states = "all", geo = "place", year = "2020")} } diff --git a/man/figures/logo.png b/man/figures/logo.png deleted file mode 100644 index b279633..0000000 Binary files a/man/figures/logo.png and /dev/null differ diff --git a/man/format_legacy_data.Rd b/man/format_legacy_data.Rd index 2f7cc50..eefc5f3 100644 --- a/man/format_legacy_data.Rd +++ b/man/format_legacy_data.Rd @@ -8,7 +8,7 @@ format_legacy_data(legacyFilePath, state, outFile = NULL) } \arguments{ \item{legacyFilePath}{A character vector giving the location of a legacy census data folder, -sourced from https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. +sourced from https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. These file names should end in ".pl".} \item{state}{The two letter state postal code.} @@ -21,10 +21,10 @@ filepath should end in ".RData".} for Bayesian name geocoding. } \details{ -This function allows users to construct datasets for analysis using the census legacy data format. -These data are available for the 2020 census at -https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. 
-This function returns data structured analogously to data from the Census API, which is not yet +This function allows users to construct datasets for analysis using the census legacy data format. +These data are available for the 2020 census at +https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. +This function returns data structured analogously to data from the Census API, which is not yet available for the 2020 Census as of September 2021. } \examples{ diff --git a/man/get_census_api.Rd b/man/get_census_api.Rd index c3ebc13..cf99156 100644 --- a/man/get_census_api.Rd +++ b/man/get_census_api.Rd @@ -4,24 +4,14 @@ \alias{get_census_api} \title{Census API function.} \usage{ -get_census_api( - data_url, - key = Sys.getenv("CENSUS_API_KEY"), - var.names, - region, - retry = 0 -) +get_census_api(data_url, key, var.names, region, retry = 0) } \arguments{ \item{data_url}{URL root of the API, e.g., \code{"https://api.census.gov/data/2020/dec/pl"}.} -\item{key}{A character string containing a valid Census API key, -which can be requested from the -\href{https://api.census.gov/data/key_signup.html}{U.S. Census API key signup page}. - -By default, attempts to find a census key stored in an -\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.} +\item{key}{A required character object containing user's Census API key, +which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} \item{var.names}{A character vector of variables to get, e.g., \code{c("P2_005N", "P2_006N", "P2_007N", "P2_008N")}. @@ -36,7 +26,7 @@ e.g., \code{"for=block:1213&in=state:47+county:015+tract:*"}.} } \value{ If successful, output will be an object of class \code{data.frame}. -If unsuccessful, function prints the URL query that caused the error. + If unsuccessful, function prints the URL query that caused the error. } \description{ \code{get_census_api} obtains U.S. Census data via the public API. 
@@ -48,7 +38,7 @@ can specify the variables and region(s) for which to obtain data. \examples{ \dontrun{ get_census_api( - data_url = "https://api.census.gov/data/2020/dec/pl", + data_url = "https://api.census.gov/data/2020/dec/pl", key = "...", var.names = c("P2_005N", "P2_006N", "P2_007N", "P2_008N"), region = "for=county:*&in=state:34" ) } diff --git a/man/get_census_api_2.Rd b/man/get_census_api_2.Rd index a66146f..065fe9e 100644 --- a/man/get_census_api_2.Rd +++ b/man/get_census_api_2.Rd @@ -4,55 +4,45 @@ \alias{get_census_api_2} \title{Census API URL assembler.} \usage{ -get_census_api_2( - data_url, - key = Sys.getenv("CENSUS_API_KEY"), - get, - region, - retry = 3 -) +get_census_api_2(data_url, key, get, region, retry = 3) } \arguments{ \item{data_url}{URL root of the API, e.g., \code{"https://api.census.gov/data/2020/dec/pl"}.} -\item{key}{A character string containing a valid Census API key, -which can be requested from the -\href{https://api.census.gov/data/key_signup.html}{U.S. Census API key signup page}. +\item{key}{A required character object containing user's Census API key, +which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} -By default, attempts to find a census key stored in an -\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.} - -\item{get}{A character vector of variables to get, +\item{get}{A character vector of variables to get, e.g., \code{c("P2_005N", "P2_006N", "P2_007N", "P2_008N")}. -If there are more than 50 variables, then function will automatically +If there are more than 50 variables, then function will automatically split variables into separate queries.} \item{region}{Character object specifying which region to obtain data for. 
-Must contain "for" and possibly "in", +Must contain "for" and possibly "in", e.g., \code{"for=block:1213&in=state:47+county:015+tract:*"}.} \item{retry}{The number of retries at the census website if network interruption occurs.} } \value{ -If successful, output will be an object of class \code{data.frame}. -If unsuccessful, function prints the URL query that was constructed. +If successful, output will be an object of class \code{data.frame}. + If unsuccessful, function prints the URL query that was constructed. } \description{ \code{get_census_api_2} assembles URL components for \code{get_census_api}. } \details{ -This function assembles the URL components and sends the request to the Census server. -It is used by the \code{get_census_api} function. The user should not need to call this +This function assembles the URL components and sends the request to the Census server. +It is used by the \code{get_census_api} function. The user should not need to call this function directly. } \examples{ -\dontrun{try(get_census_api_2(data_url = "https://api.census.gov/data/2020/dec/pl", +\dontrun{try(get_census_api_2(data_url = "https://api.census.gov/data/2020/dec/pl", key = "...", get = c("P2_005N", "P2_006N", "P2_007N", "P2_008N"), region = "for=county:*&in=state:34"))} } \references{ -Based on code authored by Nicholas Nagle, which is available +Based on code authored by Nicholas Nagle, which is available \href{https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. 
} \keyword{internal} diff --git a/man/get_census_data.Rd b/man/get_census_data.Rd index e50d53e..dce4371 100644 --- a/man/get_census_data.Rd +++ b/man/get_census_data.Rd @@ -5,42 +5,38 @@ \title{Multilevel Census data download function.} \usage{ get_census_data( - key = Sys.getenv("CENSUS_API_KEY"), + key = NULL, states, age = FALSE, sex = FALSE, year = "2020", - census.geo = c("tract", "block", "block_group", "county", "place", "zcta"), + census.geo = "block", retry = 3, county.list = NULL ) } \arguments{ -\item{key}{A character string containing a valid Census API key, -which can be requested from the -\href{https://api.census.gov/data/key_signup.html}{U.S. Census API key signup page}. - -By default, attempts to find a census key stored in an -\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.} +\item{key}{A required character object containing a valid Census API key, +which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} \item{states}{which states to extract Census data for, e.g., \code{c("NJ", "NY")}.} -\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Age | Race). +If \code{TRUE}, function will return Pr(Geolocation, Age | Race). If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} -\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). +\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). 
+If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} \item{year}{A character object specifying the year of U.S. Census data to be downloaded. Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}. -Warning: 2020 U.S. Census data is downloaded only when \code{\var{age}} and +Warning: 2020 U.S. Census data is downloaded only when \code{\var{age}} and \code{\var{sex}} are both \code{FALSE}.} -\item{census.geo}{An optional character vector specifying what level of +\item{census.geo}{An optional character vector specifying what level of geography to use to merge in U.S. Census 2010 geographic data. Currently \code{"county"}, \code{"tract"}, \code{"block"}, and \code{"place"} are supported.} @@ -49,17 +45,17 @@ geography to use to merge in U.S. Census 2010 geographic data. Currently \item{county.list}{A named list of character vectors of counties present in your \var{voter.file}, per state.} } \value{ -Output will be an object of class \code{list} indexed by state. -Output will contain a subset of the following elements: -\code{state}, \code{age}, \code{sex}, +Output will be an object of class \code{list} indexed by state. +Output will contain a subset of the following elements: +\code{state}, \code{age}, \code{sex}, \code{county}, \code{tract}, \code{block_group}, \code{block}, and \code{place}. } \description{ -\code{get_census_data} returns county-, tract-, and block-level Census data -for specified state(s). Using this function to download Census data in advance +\code{get_census_data} returns county-, tract-, and block-level Census data +for specified state(s). Using this function to download Census data in advance can save considerable time when running \code{predict_race} and \code{census_helper}. 
} \examples{ -\dontrun{get_census_data(states = c("NJ", "NY"), age = TRUE, sex = FALSE)} -\dontrun{get_census_data(states = "MN", age = FALSE, sex = FALSE, year = "2020")} +\dontrun{get_census_data(key = "...", states = c("NJ", "NY"), age = TRUE, sex = FALSE)} +\dontrun{get_census_data(key = "...", states = "MN", age = FALSE, sex = FALSE, year = "2020")} } diff --git a/man/merge_names.Rd b/man/merge_names.Rd index ab98bee..114c6fe 100644 --- a/man/merge_names.Rd +++ b/man/merge_names.Rd @@ -58,40 +58,38 @@ in order to increase the chance of finding a match. Default is \code{TRUE}.} } \value{ Output will be an object of class \code{data.frame}. It will -consist of the original user-input data with additional columns that -specify the part of the name matched with Census data (\code{\var{surname.match}}), -and the probabilities Pr(Race | Surname) for each racial group -(\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, -\code{\var{p_his}} for Hispanic/Latino, -\code{\var{p_asi}} for Asian and Pacific Islander, and -\code{\var{p_oth}} for Other/Mixed). + consist of the original user-input data with additional columns that + specify the part of the name matched with Census data (\code{\var{surname.match}}), + and the probabilities Pr(Race | Surname) for each racial group + (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, + \code{\var{p_his}} for Hispanic/Latino, + \code{\var{p_asi}} for Asian and Pacific Islander, and + \code{\var{p_oth}} for Other/Mixed). } \description{ \code{merge_names} merges names in a user-input dataset with corresponding -race/ethnicity probabilities derived from both the U.S. Census Surname List -and Spanish Surname List and voter files from states in the Southern U.S. + race/ethnicity probabilities derived from both the U.S. Census Surname List + and Spanish Surname List and voter files from states in the Southern U.S. 
} \details{ This function allows users to match names in their dataset with database entries -estimating P(name | ethnicity) for each of the five major racial groups for each -name. The database probabilities are derived from both the U.S. Census Surname List -and Spanish Surname List and voter files from states in the Southern U.S. + estimating P(name | ethnicity) for each of the five major racial groups for each + name. The database probabilities are derived from both the U.S. Census Surname List + and Spanish Surname List and voter files from states in the Southern U.S. -By default, the function matches names as follows: -\enumerate{ -\item Search raw surnames in the database; -\item Remove any punctuation and search again; -\item Remove any spaces and search again; -\item Remove suffixes (e.g., "Jr") and search again (last names only) -\item Split double-barreled names into two parts and search first part of name; -\item Split double-barreled names into two parts and search second part of name; -} + By default, the function matches names as follows: + 1) Search raw surnames in the database; + 2) Remove any punctuation and search again; + 3) Remove any spaces and search again; + 4) Remove suffixes (e.g., "Jr") and search again (last names only) + 5) Split double-barreled names into two parts and search first part of name; + 6) Split double-barreled names into two parts and search second part of name; -Each step only applies to names not matched in a previous step. -Steps 2 through 6 are not applied if \code{clean.surname} is FALSE. + Each step only applies to names not matched in a previous step. + Steps 2 through 6 are not applied if \code{clean.surname} is FALSE. -Note: Any name appearing only on the Spanish Surname List is assigned a -probability of 1 for Hispanics/Latinos and 0 for all other racial groups. + Note: Any name appearing only on the Spanish Surname List is assigned a + probability of 1 for Hispanics/Latinos and 0 for all other racial groups. 
} \examples{ data(voters) diff --git a/man/merge_surnames.Rd b/man/merge_surnames.Rd index 08c257d..a4ccd94 100644 --- a/man/merge_surnames.Rd +++ b/man/merge_surnames.Rd @@ -13,64 +13,62 @@ merge_surnames( ) } \arguments{ -\item{voter.file}{An object of class \code{data.frame}. Must contain a field +\item{voter.file}{An object of class \code{data.frame}. Must contain a field named 'surname' containing list of surnames to be merged with Census lists.} -\item{surname.year}{An object of class \code{numeric} indicating which year -Census Surname List is from. Accepted values are \code{2010} and \code{2000}. +\item{surname.year}{An object of class \code{numeric} indicating which year +Census Surname List is from. Accepted values are \code{2010} and \code{2000}. Default is \code{2020}.} -\item{name.data}{An object of class \code{data.frame}. Must contain a leading -column of surnames, and 5 subsequent columns, with Pr(Race | Surname) for each +\item{name.data}{An object of class \code{data.frame}. Must contain a leading +column of surnames, and 5 subsequent columns, with Pr(Race | Surname) for each of the five major racial categories.} -\item{clean.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -any surnames in \code{\var{voter.file}} that cannot initially be matched -to surname lists will be cleaned, according to U.S. Census specifications, +\item{clean.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +any surnames in \code{\var{voter.file}} that cannot initially be matched +to surname lists will be cleaned, according to U.S. Census specifications, in order to increase the chance of finding a match. Default is \code{TRUE}.} -\item{impute.missing}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -race/ethnicity probabilities will be imputed for unmatched names using +\item{impute.missing}{A \code{TRUE}/\code{FALSE} object. 
If \code{TRUE}, +race/ethnicity probabilities will be imputed for unmatched names using race/ethnicity distribution for all other names (i.e., not on Census List). Default is \code{TRUE}.} } \value{ -Output will be an object of class \code{data.frame}. It will -consist of the original user-input data with additional columns that -specify the part of the name matched with Census data (\code{\var{surname.match}}), -and the probabilities Pr(Race | Surname) for each racial group -(\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, -\code{\var{p_his}} for Hispanic/Latino, -\code{\var{p_asi}} for Asian and Pacific Islander, and -\code{\var{p_oth}} for Other/Mixed). +Output will be an object of class \code{data.frame}. It will + consist of the original user-input data with additional columns that + specify the part of the name matched with Census data (\code{\var{surname.match}}), + and the probabilities Pr(Race | Surname) for each racial group + (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, + \code{\var{p_his}} for Hispanic/Latino, + \code{\var{p_asi}} for Asian and Pacific Islander, and + \code{\var{p_oth}} for Other/Mixed). #' } \description{ -\code{merge_surnames} merges surnames in user-input dataset with corresponding -race/ethnicity probabilities from U.S. Census Surname List and Spanish Surname List. +\code{merge_surnames} merges surnames in user-input dataset with corresponding + race/ethnicity probabilities from U.S. Census Surname List and Spanish Surname List. } \details{ -This function allows users to match surnames in their dataset with the U.S. -Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain -Pr(Race | Surname) for each of the five major racial groups. 
-
-By default, the function matches surnames to the Census list as follows:
-\enumerate{
-\item Search raw surnames in Census surname list;
-\item Remove any punctuation and search again;
-\item Remove any spaces and search again;
-\item Remove suffixes (e.g., Jr) and search again;
-\item Split double-barreled surnames into two parts and search first part of name;
-\item Split double-barreled surnames into two parts and search second part of name;
-\item For any remaining names, impute probabilities using distribution
-for all names not appearing on Census list.
-}
-
-Each step only applies to surnames not matched in a previous ste.
-Steps 2 through 7 are not applied if \code{clean.surname} is FALSE.
-
-Note: Any name appearing only on the Spanish Surname List is assigned a
-probability of 1 for Hispanics/Latinos and 0 for all other racial groups.
+This function allows users to match surnames in their dataset with the U.S.
+ Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain
+ Pr(Race | Surname) for each of the five major racial groups.
+
+ By default, the function matches surnames to the Census list as follows:
+ 1) Search raw surnames in Census surname list;
+ 2) Remove any punctuation and search again;
+ 3) Remove any spaces and search again;
+ 4) Remove suffixes (e.g., Jr) and search again;
+ 5) Split double-barreled surnames into two parts and search first part of name;
+ 6) Split double-barreled surnames into two parts and search second part of name;
+ 7) For any remaining names, impute probabilities using distribution
+ for all names not appearing on Census list.
+
+ Each step only applies to surnames not matched in a previous step.
+ Steps 2 through 7 are not applied if \code{clean.surname} is FALSE.
+
+ Note: Any name appearing only on the Spanish Surname List is assigned a
+ probability of 1 for Hispanics/Latinos and 0 for all other racial groups. 
} \examples{ data(voters) diff --git a/man/modfuns.Rd b/man/modfuns.Rd index 71481a8..9702621 100644 --- a/man/modfuns.Rd +++ b/man/modfuns.Rd @@ -14,7 +14,7 @@ surname.year = 2020, name.dictionaries = NULL, census.geo, - census.key = Sys.getenv("CENSUS_API_KEY"), + census.key, census.data = NULL, age = FALSE, sex = FALSE, @@ -31,14 +31,13 @@ predict_race_new( year = "2020", age = FALSE, sex = FALSE, - census.geo = c("tract", "block", "block_group", "county", "place", "zcta"), - census.key = Sys.getenv("CENSUS_API_KEY"), + census.geo, + census.key = NULL, name.dictionaries, surname.only = FALSE, census.data = NULL, retry = 0, impute.missing = TRUE, - skip_bad_geos = FALSE, census.surname = FALSE, use.counties = FALSE ) @@ -49,8 +48,8 @@ predict_race_me( year = "2020", age = FALSE, sex = FALSE, - census.geo = c("tract", "block", "block_group", "county", "place", "zcta"), - census.key = Sys.getenv("CENSUS_API_KEY"), + census.geo, + census.key, name.dictionaries, surname.only = FALSE, census.data = NULL, @@ -75,12 +74,7 @@ predict_race_me( \item{census.geo}{See documentation in \code{race_predict}.} -\item{census.key}{A character object specifying user's Census API key. -Required if \code{census.geo} is specified, because a valid Census API key is -required to download Census geographic data. 
-
-If \code{\link{NULL}}, the default, attempts to find a census key stored in an
-\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.}
+\item{census.key}{See documentation in \code{race_predict}.}

\item{census.data}{See documentation in \code{race_predict}.}

@@ -100,25 +94,16 @@ If \code{\link{NULL}}, the default, attempts to find a census key stored in an

\item{names.to.use}{See documentation in \code{race_predict}.}

-\item{skip_bad_geos}{See documentation in \code{race_predict}.}
-
\item{race.init}{See documentation in \code{race_predict}.}

-\item{ctrl}{See \code{control} in documentation for \code{\link[=predict_race]{predict_race()}}.}
+\item{ctrl}{See \code{control} in documentation for \code{race_predict}.}
}
\value{
-Output will be an object of class \code{data.frame}. It will
-consist of the original user-input \code{voter.file} with additional columns with
-predicted probabilities for each of the five major racial categories:
-\code{\var{pred.whi}} for White,
-\code{\var{pred.bla}} for Black,
-\code{\var{pred.his}} for Hispanic/Latino,
-\code{\var{pred.asi}} for Asian/Pacific Islander, and
-\code{\var{pred.oth}} for Other/Mixed.
+See documentation in \code{race_predict}.
}
\description{
These functions are intended for internal use only. Users should use the
-\code{\link[=predict_race]{predict_race()}} interface rather any of these functions directly.
+\code{race_predict} interface rather than any of these functions directly.
}
\details{
These functions fit different versions of WRU. 
\code{.predict_race_old} fits diff --git a/man/predict_race.Rd b/man/predict_race.Rd index f3bbcd7..5b80be6 100644 --- a/man/predict_race.Rd +++ b/man/predict_race.Rd @@ -8,8 +8,8 @@ predict_race( voter.file, census.surname = TRUE, surname.only = FALSE, - census.geo = c("tract", "block", "block_group", "county", "place", "zcta"), - census.key = Sys.getenv("CENSUS_API_KEY"), + census.geo, + census.key = NULL, census.data = NULL, age = FALSE, sex = FALSE, @@ -17,7 +17,6 @@ predict_race( party = NULL, retry = 3, impute.missing = TRUE, - skip_bad_geos = FALSE, use.counties = FALSE, model = "BISG", race.init = NULL, @@ -35,12 +34,12 @@ If using geolocation in predictions, \code{\var{voter.file}} must contain a fiel state of residence (e.g., \code{"nj"} for New Jersey). If using Census geographic data in race/ethnicity predictions, \code{\var{voter.file}} must also contain at least one of the following fields: -\code{\var{county}}, \code{\var{tract}}, \code{\var{block_group}}, \code{\var{block}}, +\code{\var{county}}, \code{\var{tract}}, \code{\var{block_group}}, \code{\var{block}}, and/or \code{\var{place}}. These fields should contain character strings matching U.S. Census categories. County is three characters (e.g., \code{"031"} not \code{"31"}), tract is six characters, block group is usually a single character and block -is four characters. Place is five characters. + is four characters. Place is five characters. See below for other optional fields.} \item{census.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, @@ -54,7 +53,7 @@ only use surname data and calculate Pr(Race | Surname). Default is \code{FALSE}. \item{census.geo}{An optional character vector specifying what level of geography to use to merge in U.S. Census geographic data. Currently -\code{"county"}, \code{"tract"}, \code{"block_group"}, \code{"block"}, and \code{"place"} +\code{"county"}, \code{"tract"}, \code{"block_group"}, \code{"block"}, and \code{"place"} are supported. 
Note: sufficient information must be in user-defined \code{\var{voter.file}} object. If \code{\var{census.geo} = "county"}, then \code{\var{voter.file}} @@ -65,16 +64,12 @@ And if \code{\var{census.geo} = "block"}, then \code{\var{voter.file}} must have columns named \code{county}, \code{tract}, and \code{block}. If \code{\var{census.geo} = "place"}, then \code{\var{voter.file}} must have column named \code{place}. -If \code{census.geo = "zcta"}, then \code{voter.file} must have column named \code{zcta}. Specifying \code{\var{census.geo}} will call \code{census_helper} function to merge Census geographic data at specified level of geography.} -\item{census.key}{A character object specifying user's Census API key. -Required if \code{census.geo} is specified, because a valid Census API key is -required to download Census geographic data. - -If \code{\link{NULL}}, the default, attempts to find a census key stored in an -\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.} +\item{census.key}{A character object specifying user's Census API +key. Required if \code{\var{census.geo}} is specified, because +a valid Census API key is required to download Census geographic data.} \item{census.data}{A list indexed by two-letter state abbreviations, which contains pre-saved Census geographic data. @@ -107,23 +102,19 @@ it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other.} \item{impute.missing}{Logical, defaults to TRUE. Should missing be imputed?} -\item{skip_bad_geos}{Logical. Option to have the function skip any geolocations that are not present -in the census data, returning a partial data set. Default is set to \code{FALSE}, in which case it -will break and provide error message with a list of offending geolocations.} - -\item{use.counties}{A logical, defaulting to FALSE. Should census data be filtered by counties +\item{use.counties}{A logical, defaulting to FALSE. 
Should census data be filtered by counties available in \var{census.data}?} -\item{model}{Character string, either "BISG" (default) or "fBISG" (for error-correction, +\item{model}{Character string, either "BISG" (default) or "fBISG" (for error-correction, fully-Bayesian model).} \item{race.init}{Vector of initial race for each observation in voter.file. -Must be an integer vector, with 1=white, 2=black, 3=hispanic, 4=asian, and +Must be an integer vector, with 1=white, 2=black, 3=hispanic, 4=asian, and 5=other. Defaults to values obtained using \code{model="BISG_surname"}.} -\item{name.dictionaries}{Optional named list of \code{data.frame}'s -containing counts of names by race. Any of the following named elements -are allowed: "surname", "first", "middle". When present, the objects must +\item{name.dictionaries}{Optional named list of \code{data.frame}'s +containing counts of names by race. Any of the following named elements +are allowed: "surname", "first", "middle". When present, the objects must follow the same structure as \code{last_c}, \code{first_c}, \code{mid_c}, respectively.} @@ -131,23 +122,23 @@ follow the same structure as \code{last_c}, \code{first_c}, middle'. Defaults to 'surname'.} \item{control}{List of control arguments only used when \code{model="fBISG"}, including -\describe{ -\item{iter}{Number of MCMC iterations. Defaults to 1000.} -\item{burnin}{Number of iterations discarded as burnin. Defaults to half of \code{iter}.} -\item{verbose}{Print progress information. Defaults to \code{TRUE}.} -\item{me.correct}{Boolean. Should the model correct measurement error for \code{races|geo}? Defaults to \code{TRUE}.} -\item{seed}{RNG seed. If \code{NULL}, a seed is generated and returned as an attribute for reproducibility.} +\itemize{ + \item{iter}{ Number of MCMC iterations. Defaults to 1000.} + \item{burnin}{ Number of iterations discarded as burnin. Defaults to half of \code{iter}.} + \item{verbose}{ Print progress information. 
Defaults to \code{TRUE}.} + \item{me.correct}{ Boolean. Should the model correcting measurement error for \code{races|geo}? Defaults to \code{TRUE}.} + \item{seed}{ RNG seed. If \code{NULL}, a seed is generated and returned as an attribute for reproducibility.} }} } \value{ Output will be an object of class \code{data.frame}. It will -consist of the original user-input \code{voter.file} with additional columns with -predicted probabilities for each of the five major racial categories: -\code{\var{pred.whi}} for White, -\code{\var{pred.bla}} for Black, -\code{\var{pred.his}} for Hispanic/Latino, -\code{\var{pred.asi}} for Asian/Pacific Islander, and -\code{\var{pred.oth}} for Other/Mixed. + consist of the original user-input \code{voter.file} with additional columns with + predicted probabilities for each of the five major racial categories: + \code{\var{pred.whi}} for White, + \code{\var{pred.bla}} for Black, + \code{\var{pred.his}} for Hispanic/Latino, + \code{\var{pred.asi}} for Asian/Pacific Islander, and + \code{\var{pred.oth}} for Other/Mixed. } \description{ \code{predict_race} makes probabilistic estimates of individual-level race/ethnicity. @@ -162,25 +153,25 @@ individual-level race/ethnicity, based on surname, geolocation, and party. 
#' data(voters) try(predict_race(voter.file = voters, surname.only = TRUE)) \dontrun{ -try(predict_race(voter.file = voters, census.geo = "tract")) +try(predict_race(voter.file = voters, census.geo = "tract", census.key = "...")) } \dontrun{ try(predict_race( - voter.file = voters, census.geo = "place", year = "2020")) + voter.file = voters, census.geo = "place", census.key = "...", year = "2020")) } \dontrun{ -CensusObj <- try(get_census_data(state = c("NY", "DC", "NJ"))) +CensusObj <- try(get_census_data("...", state = c("NY", "DC", "NJ"))) try(predict_race( voter.file = voters, census.geo = "tract", census.data = CensusObj, party = "PID") ) } \dontrun{ -CensusObj2 <- try(get_census_data(state = c("NY", "DC", "NJ"), age = T, sex = T)) +CensusObj2 <- try(get_census_data(key = "...", state = c("NY", "DC", "NJ"), age = T, sex = T)) try(predict_race( voter.file = voters, census.geo = "tract", census.data = CensusObj2, age = T, sex = T)) } \dontrun{ -CensusObj3 <- try(get_census_data(state = c("NY", "DC", "NJ"), census.geo = "place")) +CensusObj3 <- try(get_census_data(key = "...", state = c("NY", "DC", "NJ"), census.geo = "place")) try(predict_race(voter.file = voters, census.geo = "place", census.data = CensusObj3)) } } diff --git a/man/sample_me.Rd b/man/sample_me.Rd index eab7839..4e82121 100644 --- a/man/sample_me.Rd +++ b/man/sample_me.Rd @@ -26,17 +26,15 @@ sample_me( \item{first_name}{See last_name} -\item{mid_name}{See last_name} - \item{geo}{Integer vector of geographic units for each record. 
Must match column number in N_rg} \item{N_rg}{Integer matrix of race | geography counts in census (geograpgies in columns).} \item{pi_s}{Numeric matrix of race | surname prior probabilities.} -\item{pi_f}{Same as \code{pi_s}, but for first names.} +\item{pi_f}{Same as `pi_s`, but for first names.} -\item{pi_m}{Same as \code{pi_s}, but for middle names.} +\item{pi_m}{Same as `pi_s`, but for middle names.} \item{pi_nr}{Matrix of marginal probability distribution over missing names; non-keyword names default to this distribution.} @@ -49,6 +47,18 @@ sample_me( \item{race_init}{Integer vector of initial race assignments} \item{verbose}{Boolean; should informative messages be printed?} + +\item{middle_name}{See last_name} + +\item{M_rs}{Integer matrix of race | surname counts in dictionary (surnames in columns).} + +\item{M_rf}{Same as `M_rs`, but for first names (can be empty matrix for surname only models).} + +\item{M_rm}{Same as `M_rs`, but for middle names (can be empty matrix for surname, or surname and first name only models).} + +\item{alpha}{Numeric matrix of race | geography prior probabilities.} + +\item{me_race}{Boolean; should measurement error in race | geography be corrected?} } \description{ Collapsed Gibbs sampler for hWRU. 
Internal function diff --git a/man/state_fips.Rd b/man/state_fips.Rd deleted file mode 100644 index 23a6fc2..0000000 --- a/man/state_fips.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_state_fips.R -\docType{data} -\name{state_fips} -\alias{state_fips} -\title{Dataset with FIPS codes for US states} -\format{ -A tibble with 57 rows and 3 columns: -\describe{ -\item{\code{state}}{Two-letter postal abbreviation} -\item{\code{state_code}}{Two-digit FIPS code} -\item{\code{state_name}}{English name} -} -} -\source{ -Derived from \code{\link[tidycensus:fips_codes]{tidycensus::fips_codes()}} -} -\usage{ -state_fips -} -\description{ -Dataset including FIPS codes and postal abbreviations for each U.S. state, -district, and territory. -} -\keyword{datasets} diff --git a/man/surnames2000.Rd b/man/surnames2000.Rd index 6dc45f5..00a2c30 100644 --- a/man/surnames2000.Rd +++ b/man/surnames2000.Rd @@ -7,13 +7,13 @@ \format{ A data frame with 157,728 rows and 6 variables: \describe{ -\item{surname}{Surname} -\item{p_whi}{Pr(White | Surname)} -\item{p_bla}{Pr(Black | Surname)} -\item{p_his}{Pr(Hispanic/Latino | Surname)} -\item{p_asi}{Pr(Asian/Pacific Islander | Surname)} -\item{p_oth}{Pr(Other | Surname)} -#' } + \item{surname}{Surname} + \item{p_whi}{Pr(White | Surname)} + \item{p_bla}{Pr(Black | Surname)} + \item{p_his}{Pr(Hispanic/Latino | Surname)} + \item{p_asi}{Pr(Asian/Pacific Islander | Surname)} + \item{p_oth}{Pr(Other | Surname)} + #' } } \usage{ surnames2000 diff --git a/man/surnames2010.Rd b/man/surnames2010.Rd index a09afdf..7341985 100644 --- a/man/surnames2010.Rd +++ b/man/surnames2010.Rd @@ -7,13 +7,13 @@ \format{ A data frame with 167,613 rows and 6 variables: \describe{ -\item{surname}{Surname} -\item{p_whi}{Pr(White | Surname)} -\item{p_bla}{Pr(Black | Surname)} -\item{p_his}{Pr(Hispanic/Latino | Surname)} -\item{p_asi}{Pr(Asian/Pacific Islander | Surname)} -\item{p_oth}{Pr(Other | 
Surname)} -#' } + \item{surname}{Surname} + \item{p_whi}{Pr(White | Surname)} + \item{p_bla}{Pr(Black | Surname)} + \item{p_his}{Pr(Hispanic/Latino | Surname)} + \item{p_asi}{Pr(Asian/Pacific Islander | Surname)} + \item{p_oth}{Pr(Other | Surname)} + #' } } \usage{ surnames2010 diff --git a/man/vec_to_chunk.Rd b/man/vec_to_chunk.Rd index fe5ae7f..eb77daa 100644 --- a/man/vec_to_chunk.Rd +++ b/man/vec_to_chunk.Rd @@ -16,9 +16,9 @@ Object of class \code{list}. \code{vec_to_chunk} takes a list of variables and collects them into 50-variable chunks. } \details{ -This function takes a list of variable names and collects them into chunks with no more than -50 variables each. This helps to get around requests with more than 50 variables,because the -API only allows queries of 50 variables at a time. +This function takes a list of variable names and collects them into chunks with no more than +50 variables each. This helps to get around requests with more than 50 variables,because the +API only allows queries of 50 variables at a time. The user should not need to call this function directly. } \examples{ @@ -29,7 +29,7 @@ vec_to_chunk(x = c(paste("P012F0", seq(10:49), sep = ""), } \references{ -Based on code authored by Nicholas Nagle, which is available +Based on code authored by Nicholas Nagle, which is available \href{https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. 
} \keyword{internal} diff --git a/man/voters.Rd b/man/voters.Rd index be5d7bc..3637041 100644 --- a/man/voters.Rd +++ b/man/voters.Rd @@ -7,22 +7,22 @@ \format{ A data frame with 10 rows and 12 variables: \describe{ -\item{VoterID}{Voter identifier (numeric)} -\item{surname}{Surname} -\item{state}{State of residence} -\item{CD}{Congressional district} -\item{county}{Census county (three-digit code)} -\item{first}{First name} -\item{last}{Last name or surname} -\item{tract}{Census tract (six-digit code)} -\item{block}{Census block (four-digit code)} -\item{precinct}{Voting precinct} -\item{place}{Voting place} -\item{age}{Age in years} -\item{sex}{0=male, 1=female} -\item{party}{Party registration (character)} -\item{PID}{Party registration (numeric)} -#' } + \item{VoterID}{Voter identifier (numeric)} + \item{surname}{Surname} + \item{state}{State of residence} + \item{CD}{Congressional district} + \item{county}{Census county (three-digit code)} + \item{first}{First name} + \item{last}{Last name or surname} + \item{tract}{Census tract (six-digit code)} + \item{block}{Census block (four-digit code)} + \item{precinct}{Voting precinct} + \item{place}{Voting place} + \item{age}{Age in years} + \item{sex}{0=male, 1=female} + \item{party}{Party registration (character)} + \item{PID}{Party registration (numeric)} + #' } } \usage{ voters diff --git a/man/wru_data_preflight.Rd b/man/wru_data_preflight.Rd index 7da9e58..cc93d37 100644 --- a/man/wru_data_preflight.Rd +++ b/man/wru_data_preflight.Rd @@ -10,6 +10,6 @@ wru_data_preflight() Checks if namedata is available in the current working directory, if not downloads it from github using piggyback. By default, wru will download the data to a temporary directory that lasts as long as your session does. -However, you may wish to set the \code{wru_data_wd} option to save the +However, you may wish to set the \code{wru_data_wd} option to save the downloaded data to your current working directory for more permanence. 
} diff --git a/src/Makevars b/src/Makevars index e4d22e6..c532650 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,2 @@ -CXX_STD = CXX17 +CXX_STD = CXX11 PKG_CPPFLAGS = -DARMA_64BIT_WORD=1 \ No newline at end of file diff --git a/src/sample_me.cpp b/src/sample_me.cpp index 52749b9..8eae6a3 100644 --- a/src/sample_me.cpp +++ b/src/sample_me.cpp @@ -4,9 +4,13 @@ //' //' @param last_name Integer vector of last name identifiers for each record (zero indexed; as all that follow). Must match columns numbers in M_rs. //' @param first_name See last_name -//' @param mid_name See last_name +//' @param middle_name See last_name //' @param geo Integer vector of geographic units for each record. Must match column number in N_rg //' @param N_rg Integer matrix of race | geography counts in census (geograpgies in columns). +//' @param M_rs Integer matrix of race | surname counts in dictionary (surnames in columns). +//' @param M_rf Same as `M_rs`, but for first names (can be empty matrix for surname only models). +//' @param M_rm Same as `M_rs`, but for middle names (can be empty matrix for surname, or surname and first name only models). +//' @param alpha Numeric matrix of race | geography prior probabilities. //' @param pi_s Numeric matrix of race | surname prior probabilities. //' @param pi_f Same as `pi_s`, but for first names. //' @param pi_m Same as `pi_s`, but for middle names. @@ -14,6 +18,7 @@ //' @param which_names Integer; 0=surname only. 1=surname + first name. 2= surname, first, and middle names. //' @param samples Integer number of samples to take after (in total) //' @param burnin Integer number of samples to discard as burn-in of Markov chain +//' @param me_race Boolean; should measurement error in race | geography be corrected? //' @param race_init Integer vector of initial race assignments //' @param verbose Boolean; should informative messages be printed? 
//' diff --git a/src/sample_me.h b/src/sample_me.h index d73d705..894db14 100644 --- a/src/sample_me.h +++ b/src/sample_me.h @@ -1,6 +1,7 @@ #ifndef _HWRUME #define _HWRUME +// [[Rcpp::plugins(cpp11)]] #include #include "aux_funs.h" diff --git a/tests/testthat/_snaps/census_geo_api.md b/tests/testthat/_snaps/census_geo_api.md deleted file mode 100644 index 08d8ba2..0000000 --- a/tests/testthat/_snaps/census_geo_api.md +++ /dev/null @@ -1,918 +0,0 @@ -# snapshot - - structure(list(state = c("DE", "DE", "DE"), county = c("003", - "005", "001"), P12I_001N = c(303265, 171741, 104845), P12J_001N = c(142388, - 24835, 45737), P12H_001N = c(63516, 26793, 13981), P12L_001N = c(35022, - 3014, 4362), P12M_001N = c(127, 70, 107), P12K_001N = c(893, - 764, 864), P12N_001N = c(2555, 1018, 1028), P12O_001N = c(22953, - 9143, 10927), r_whi = c(0.523005047848499, 0.296181260358264, - 0.180813691793237), r_bla = c(0.668613824192337, 0.116618144252442, - 0.214768031555222), r_his = c(0.609032505513472, 0.256908620193691, - 0.134058874292837), r_asi = c(0.823123038733549, 0.0722214416186596, - 0.104655519647792), r_oth = c(0.526493169807558, 0.217868182271413, - 0.255638647921029)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, - -3L)) - ---- - - structure(list(state = c("DE", "DE", "DE"), county = c("001", - "005", "003"), PCT012I001 = c(105891, 149025, 331836), PCT012J001 = c(37812, - 24544, 124426), PCT012H001 = c(9346, 16954, 46921), PCT012L001 = c(3266, - 1910, 23132), PCT012M001 = c(74, 62, 102), PCT012K001 = c(916, - 924, 984), PCT012N001 = c(347, 305, 873), PCT012O001 = c(4658, - 3421, 10205), r_whi = c(0.180469772578534, 0.253982943390052, - 0.565547284031414), r_bla = c(0.20243920720412, 0.131404525061301, - 0.666156267734578), r_his = c(0.127640977315251, 0.23154559484301, - 0.64081342784174), r_asi = c(0.117004133678974, 0.0690814825194423, - 0.813914383801583), r_oth = c(0.261609154774003, 0.205452215791101, - 0.532938629434896)), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, - -3L)) - ---- - - structure(list(state = c("DE", "DE", "DE"), county = c("001", - "005", "003"), PCT012I001 = c(105891, 149025, 331836), PCT012J001 = c(37812, - 24544, 124426), PCT012H001 = c(9346, 16954, 46921), PCT012L001 = c(3266, - 1910, 23132), PCT012M001 = c(74, 62, 102), PCT012K001 = c(916, - 924, 984), PCT012N001 = c(347, 305, 873), PCT012O001 = c(4658, - 3421, 10205), PCT012I002 = c(51574, 72032, 160694), PCT012J002 = c(17702, - 11860, 58678), PCT012H002 = c(4579, 9154, 24193), PCT012L002 = c(1339, - 875, 11390), PCT012M002 = c(33, 29, 42), PCT012K002 = c(448, - 481, 473), PCT012N002 = c(163, 152, 431), PCT012O002 = c(2197, - 1650, 4770), PCT012I106 = c(54317, 76993, 171142), PCT012J106 = c(20110, - 12684, 65748), PCT012H106 = c(4767, 7800, 22728), PCT012L106 = c(1927, - 1035, 11742), PCT012M106 = c(41, 33, 60), PCT012K106 = c(468, - 443, 511), PCT012N106 = c(184, 153, 442), PCT012O106 = c(2461, - 1771, 5435), r_whi = c(0.180469772578534, 0.253982943390052, - 0.565547284031414), r_bla = c(0.20243920720412, 0.131404525061301, - 0.666156267734578), r_his = c(0.127640977315251, 0.23154559484301, - 0.64081342784174), r_asi = c(0.117004133678974, 0.0690814825194423, - 0.813914383801583), r_oth = c(0.261609154774003, 0.205452215791101, - 0.532938629434896), r_mal_whi = c(0.0878974421902269, 0.122763961605585, - 0.273870391579407), r_mal_bla = c(0.0947735863198809, 0.0634964825304365, - 0.314152327312054), r_mal_his = c(0.0625367039510523, 0.125018778765655, - 0.330410674533262), r_mal_asi = c(0.0480627758705248, 0.0316681846843691, - 0.400476424017375), r_mal_oth = c(0.124066628374497, 0.100870410462599, - 0.250695886537357), r_fem_whi = c(0.0925723303883072, 0.131218981784468, - 0.291676892452007), r_fem_bla = c(0.107665620884239, 0.0679080425308649, - 0.352003940422525), r_fem_his = c(0.0651042733641988, 0.106526816077355, - 0.310402753308477), r_fem_asi = c(0.0689413578084495, 0.0374132978350732, - 0.413437959784208), 
r_fem_oth = c(0.137542526399505, 0.104581805328503, - 0.282242742897539)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, - -3L)) - ---- - - structure(list(state = c("DE", "DE", "DE"), county = c("001", - "005", "003"), PCT012I001 = c(105891, 149025, 331836), PCT012J001 = c(37812, - 24544, 124426), PCT012H001 = c(9346, 16954, 46921), PCT012L001 = c(3266, - 1910, 23132), PCT012M001 = c(74, 62, 102), PCT012K001 = c(916, - 924, 984), PCT012N001 = c(347, 305, 873), PCT012O001 = c(4658, - 3421, 10205), PCT012I003 = c(617, 593, 1507), PCT012I004 = c(631, - 603, 1546), PCT012I005 = c(613, 647, 1540), PCT012I006 = c(636, - 683, 1573), PCT012I007 = c(662, 625, 1627), PCT012I107 = c(563, - 605, 1377), PCT012I108 = c(607, 590, 1424), PCT012I109 = c(601, - 575, 1553), PCT012I110 = c(551, 618, 1548), PCT012I111 = c(591, - 647, 1595), PCT012J003 = c(280, 216, 872), PCT012J004 = c(272, - 212, 897), PCT012J005 = c(260, 182, 945), PCT012J006 = c(309, - 182, 881), PCT012J007 = c(277, 204, 857), PCT012J107 = c(260, - 197, 865), PCT012J108 = c(256, 177, 862), PCT012J109 = c(273, - 207, 928), PCT012J110 = c(273, 188, 875), PCT012J111 = c(275, - 183, 868), PCT012H003 = c(116, 254, 523), PCT012H004 = c(121, - 259, 530), PCT012H005 = c(138, 252, 551), PCT012H006 = c(113, - 223, 558), PCT012H007 = c(112, 265, 573), PCT012H107 = c(126, - 220, 520), PCT012H108 = c(95, 243, 478), PCT012H109 = c(137, - 252, 525), PCT012H110 = c(124, 242, 584), PCT012H111 = c(134, - 241, 523), PCT012L003 = c(18, 7, 175), PCT012M003 = c(0, 0, 0 - ), PCT012L004 = c(9, 11, 167), PCT012M004 = c(0, 0, 2), PCT012L005 = c(17, - 14, 184), PCT012M005 = c(0, 3, 0), PCT012L006 = c(25, 8, 181), - PCT012M006 = c(0, 0, 0), PCT012L007 = c(15, 9, 200), PCT012M007 = c(0, - 1, 1), PCT012L107 = c(17, 9, 171), PCT012M107 = c(1, 1, 0 - ), PCT012L108 = c(18, 9, 175), PCT012M108 = c(0, 0, 1), PCT012L109 = c(23, - 11, 178), PCT012M109 = c(0, 0, 0), PCT012L110 = c(21, 9, - 190), PCT012M110 = c(1, 0, 0), PCT012L111 = c(24, 
15, 148 - ), PCT012M111 = c(0, 1, 0), PCT012K003 = c(7, 2, 10), PCT012N003 = c(7, - 1, 13), PCT012O003 = c(91, 66, 176), PCT012K004 = c(6, 4, - 7), PCT012N004 = c(2, 11, 7), PCT012O004 = c(84, 73, 191), - PCT012K005 = c(3, 4, 2), PCT012N005 = c(3, 7, 11), PCT012O005 = c(67, - 69, 180), PCT012K006 = c(2, 6, 4), PCT012N006 = c(9, 5, 18 - ), PCT012O006 = c(88, 67, 168), PCT012K007 = c(4, 2, 3), - PCT012N007 = c(4, 6, 14), PCT012O007 = c(88, 67, 172), PCT012K107 = c(2, - 2, 6), PCT012N107 = c(6, 6, 13), PCT012O107 = c(87, 69, 184 - ), PCT012K108 = c(2, 3, 8), PCT012N108 = c(8, 4, 10), PCT012O108 = c(84, - 78, 190), PCT012K109 = c(4, 1, 6), PCT012N109 = c(6, 1, 9 - ), PCT012O109 = c(80, 79, 188), PCT012K110 = c(2, 7, 9), - PCT012N110 = c(11, 3, 11), PCT012O110 = c(87, 68, 168), PCT012K111 = c(4, - 4, 5), PCT012N111 = c(6, 3, 17), PCT012O111 = c(84, 76, 171 - ), PCT012I008 = c(645, 646, 1592), PCT012I009 = c(669, 672, - 1751), PCT012I010 = c(646, 643, 1717), PCT012I011 = c(624, - 692, 1646), PCT012I012 = c(681, 731, 1788), PCT012I112 = c(591, - 609, 1630), PCT012I113 = c(612, 595, 1604), PCT012I114 = c(609, - 644, 1588), PCT012I115 = c(607, 617, 1564), PCT012I116 = c(611, - 660, 1698), PCT012J008 = c(265, 200, 913), PCT012J009 = c(286, - 199, 901), PCT012J010 = c(279, 167, 951), PCT012J011 = c(265, - 183, 985), PCT012J012 = c(317, 161, 953), PCT012J112 = c(298, - 192, 851), PCT012J113 = c(325, 170, 904), PCT012J114 = c(267, - 181, 874), PCT012J115 = c(287, 187, 899), PCT012J116 = c(288, - 206, 988), PCT012H008 = c(104, 235, 523), PCT012H009 = c(112, - 201, 512), PCT012H010 = c(111, 204, 521), PCT012H011 = c(113, - 189, 478), PCT012H012 = c(85, 157, 495), PCT012H112 = c(111, - 234, 506), PCT012H113 = c(96, 169, 530), PCT012H114 = c(123, - 200, 496), PCT012H115 = c(86, 165, 448), PCT012H116 = c(88, - 143, 446), PCT012L008 = c(23, 12, 158), PCT012M008 = c(1, - 0, 0), PCT012L009 = c(21, 9, 163), PCT012M009 = c(1, 0, 1 - ), PCT012L010 = c(18, 13, 166), PCT012M010 = c(0, 1, 
2), - PCT012L011 = c(22, 26, 148), PCT012M011 = c(0, 1, 0), PCT012L012 = c(25, - 12, 150), PCT012M012 = c(0, 0, 0), PCT012L112 = c(17, 17, - 209), PCT012M112 = c(0, 0, 1), PCT012L113 = c(19, 20, 196 - ), PCT012M113 = c(1, 2, 2), PCT012L114 = c(25, 17, 166), - PCT012M114 = c(1, 0, 1), PCT012L115 = c(23, 11, 157), PCT012M115 = c(0, - 0, 1), PCT012L116 = c(31, 8, 120), PCT012M116 = c(1, 0, 2 - ), PCT012K008 = c(5, 7, 10), PCT012N008 = c(3, 4, 9), PCT012O008 = c(73, - 72, 170), PCT012K009 = c(4, 3, 6), PCT012N009 = c(7, 0, 9 - ), PCT012O009 = c(68, 57, 161), PCT012K010 = c(0, 9, 7), - PCT012N010 = c(2, 5, 10), PCT012O010 = c(65, 51, 160), PCT012K011 = c(4, - 8, 5), PCT012N011 = c(7, 5, 5), PCT012O011 = c(80, 58, 144 - ), PCT012K012 = c(2, 5, 8), PCT012N012 = c(13, 4, 12), PCT012O012 = c(78, - 51, 130), PCT012K112 = c(3, 6, 7), PCT012N112 = c(1, 3, 8 - ), PCT012O112 = c(86, 50, 160), PCT012K113 = c(1, 7, 3), - PCT012N113 = c(7, 7, 14), PCT012O113 = c(82, 56, 173), PCT012K114 = c(8, - 5, 7), PCT012N114 = c(6, 3, 13), PCT012O114 = c(64, 57, 139 - ), PCT012K115 = c(3, 2, 8), PCT012N115 = c(9, 6, 8), PCT012O115 = c(77, - 60, 154), PCT012K116 = c(5, 2, 9), PCT012N116 = c(3, 8, 7 - ), PCT012O116 = c(90, 65, 172), PCT012I013 = c(668, 699, - 1828), PCT012I014 = c(670, 682, 1832), PCT012I015 = c(663, - 684, 1872), PCT012I016 = c(639, 678, 1843), PCT012I017 = c(652, - 719, 1941), PCT012I117 = c(631, 646, 1736), PCT012I118 = c(625, - 678, 1701), PCT012I119 = c(617, 651, 1785), PCT012I120 = c(573, - 683, 1785), PCT012I121 = c(598, 709, 1863), PCT012J013 = c(320, - 196, 1000), PCT012J014 = c(293, 211, 1008), PCT012J015 = c(336, - 187, 997), PCT012J016 = c(282, 179, 1013), PCT012J017 = c(310, - 178, 1035), PCT012J117 = c(265, 168, 1015), PCT012J118 = c(308, - 164, 965), PCT012J119 = c(305, 181, 944), PCT012J120 = c(299, - 195, 929), PCT012J121 = c(282, 169, 972), PCT012H013 = c(78, - 165, 482), PCT012H014 = c(79, 123, 410), PCT012H015 = c(78, - 138, 444), PCT012H016 = c(72, 114, 
413), PCT012H017 = c(74, - 126, 398), PCT012H117 = c(109, 169, 464), PCT012H118 = c(94, - 139, 458), PCT012H119 = c(84, 124, 419), PCT012H120 = c(86, - 130, 393), PCT012H121 = c(67, 133, 390), PCT012L013 = c(22, - 6, 151), PCT012M013 = c(2, 0, 2), PCT012L014 = c(17, 22, - 119), PCT012M014 = c(0, 0, 0), PCT012L015 = c(27, 8, 113), - PCT012M015 = c(0, 0, 0), PCT012L016 = c(15, 12, 122), PCT012M016 = c(1, - 0, 2), PCT012L017 = c(22, 5, 156), PCT012M017 = c(1, 0, 0 - ), PCT012L117 = c(17, 19, 145), PCT012M117 = c(0, 0, 3), - PCT012L118 = c(22, 14, 133), PCT012M118 = c(1, 1, 1), PCT012L119 = c(17, - 14, 130), PCT012M119 = c(1, 0, 1), PCT012L120 = c(17, 20, - 123), PCT012M120 = c(0, 0, 0), PCT012L121 = c(16, 17, 109 - ), PCT012M121 = c(0, 0, 1), PCT012K013 = c(5, 6, 8), PCT012N013 = c(5, - 7, 4), PCT012O013 = c(70, 61, 134), PCT012K014 = c(8, 3, - 6), PCT012N014 = c(5, 4, 11), PCT012O014 = c(59, 47, 100), - PCT012K015 = c(4, 5, 6), PCT012N015 = c(3, 5, 14), PCT012O015 = c(72, - 40, 151), PCT012K016 = c(6, 4, 5), PCT012N016 = c(4, 2, 5 - ), PCT012O016 = c(68, 30, 118), PCT012K017 = c(5, 5, 7), - PCT012N017 = c(6, 3, 12), PCT012O017 = c(59, 45, 92), PCT012K117 = c(4, - 1, 12), PCT012N117 = c(4, 4, 8), PCT012O117 = c(64, 67, 143 - ), PCT012K118 = c(5, 3, 5), PCT012N118 = c(7, 1, 4), PCT012O118 = c(72, - 46, 139), PCT012K119 = c(10, 3, 3), PCT012N119 = c(6, 9, - 11), PCT012O119 = c(62, 46, 109), PCT012K120 = c(5, 2, 8), - PCT012N120 = c(5, 5, 7), PCT012O120 = c(67, 37, 124), PCT012K121 = c(2, - 6, 3), PCT012N121 = c(1, 3, 9), PCT012O121 = c(71, 33, 136 - ), PCT012I018 = c(701, 810, 1931), PCT012I019 = c(673, 754, - 2123), PCT012I020 = c(666, 788, 2188), PCT012I122 = c(640, - 719, 1958), PCT012I123 = c(644, 799, 1911), PCT012I124 = c(658, - 785, 2087), PCT012J018 = c(266, 204, 1087), PCT012J019 = c(312, - 230, 1084), PCT012J020 = c(336, 196, 1157), PCT012J122 = c(269, - 204, 1031), PCT012J123 = c(332, 222, 1054), PCT012J124 = c(335, - 213, 1085), PCT012H018 = c(86, 145, 420), 
PCT012H019 = c(99, - 123, 414), PCT012H020 = c(86, 143, 456), PCT012H122 = c(88, - 102, 373), PCT012H123 = c(75, 122, 416), PCT012H124 = c(80, - 108, 383), PCT012L018 = c(19, 11, 148), PCT012M018 = c(2, - 0, 1), PCT012L019 = c(25, 11, 128), PCT012M019 = c(0, 0, - 1), PCT012L020 = c(15, 15, 122), PCT012M020 = c(0, 1, 0), - PCT012L122 = c(19, 13, 118), PCT012M122 = c(0, 1, 2), PCT012L123 = c(19, - 17, 111), PCT012M123 = c(1, 0, 1), PCT012L124 = c(12, 18, - 130), PCT012M124 = c(2, 0, 1), PCT012K018 = c(10, 8, 10), - PCT012N018 = c(2, 4, 5), PCT012O018 = c(58, 34, 136), PCT012K019 = c(10, - 8, 10), PCT012N019 = c(4, 6, 10), PCT012O019 = c(49, 36, - 95), PCT012K020 = c(5, 4, 7), PCT012N020 = c(4, 6, 8), PCT012O020 = c(38, - 43, 103), PCT012K122 = c(3, 4, 2), PCT012N122 = c(4, 4, 8 - ), PCT012O122 = c(49, 43, 109), PCT012K123 = c(10, 8, 15), - PCT012N123 = c(4, 4, 7), PCT012O123 = c(59, 46, 114), PCT012K124 = c(4, - 3, 10), PCT012N124 = c(5, 3, 4), PCT012O124 = c(46, 36, 93 - ), PCT012I021 = c(663, 764, 2448), PCT012I022 = c(726, 657, - 2843), PCT012I125 = c(636, 721, 2703), PCT012I126 = c(675, - 590, 3024), PCT012J021 = c(518, 214, 1123), PCT012J022 = c(475, - 222, 1064), PCT012J125 = c(538, 194, 1060), PCT012J126 = c(587, - 180, 954), PCT012H021 = c(69, 129, 466), PCT012H022 = c(105, - 147, 479), PCT012H125 = c(104, 100, 462), PCT012H126 = c(103, - 94, 482), PCT012L021 = c(29, 10, 162), PCT012M021 = c(0, - 0, 1), PCT012L022 = c(18, 8, 197), PCT012M022 = c(0, 1, 1 - ), PCT012L125 = c(18, 12, 167), PCT012M125 = c(0, 1, 0), - PCT012L126 = c(11, 13, 182), PCT012M126 = c(0, 0, 0), PCT012K021 = c(7, - 3, 9), PCT012N021 = c(6, 1, 5), PCT012O021 = c(46, 27, 93 - ), PCT012K022 = c(6, 3, 8), PCT012N022 = c(7, 2, 4), PCT012O022 = c(57, - 30, 109), PCT012K125 = c(7, 4, 6), PCT012N125 = c(1, 2, 13 - ), PCT012O125 = c(61, 23, 101), PCT012K126 = c(6, 6, 6), - PCT012N126 = c(2, 0, 9), PCT012O126 = c(65, 35, 118), PCT012I023 = c(678, - 708, 2730), PCT012I127 = c(643, 641, 3067), 
PCT012J023 = c(445, - 188, 1016), PCT012J127 = c(559, 166, 961), PCT012H023 = c(101, - 163, 508), PCT012H127 = c(77, 99, 457), PCT012L023 = c(12, - 12, 216), PCT012M023 = c(3, 0, 1), PCT012L127 = c(24, 8, - 173), PCT012M127 = c(3, 1, 1), PCT012K023 = c(5, 6, 6), PCT012N023 = c(4, - 3, 9), PCT012O023 = c(49, 13, 96), PCT012K127 = c(4, 6, 8 - ), PCT012N127 = c(3, 1, 8), PCT012O127 = c(32, 18, 109), - PCT012I024 = c(738, 577, 2685), PCT012I128 = c(735, 589, - 2939), PCT012J024 = c(408, 165, 904), PCT012J128 = c(443, - 166, 864), PCT012H024 = c(91, 185, 485), PCT012H128 = c(97, - 124, 429), PCT012L024 = c(15, 11, 138), PCT012M024 = c(1, - 0, 3), PCT012L128 = c(15, 5, 183), PCT012M128 = c(1, 0, 2 - ), PCT012K024 = c(4, 4, 4), PCT012N024 = c(4, 2, 7), PCT012O024 = c(35, - 26, 67), PCT012K128 = c(12, 6, 11), PCT012N128 = c(4, 4, - 12), PCT012O128 = c(49, 20, 103), PCT012I025 = c(716, 597, - 2322), PCT012I026 = c(712, 649, 2159), PCT012I027 = c(694, - 705, 2124), PCT012I129 = c(650, 609, 2423), PCT012I130 = c(655, - 659, 1934), PCT012I131 = c(705, 702, 2046), PCT012J025 = c(385, - 151, 868), PCT012J026 = c(310, 170, 885), PCT012J027 = c(230, - 161, 819), PCT012J129 = c(379, 179, 886), PCT012J130 = c(302, - 172, 835), PCT012J131 = c(244, 193, 929), PCT012H025 = c(94, - 201, 446), PCT012H026 = c(106, 192, 496), PCT012H027 = c(102, - 230, 455), PCT012H129 = c(87, 162, 394), PCT012H130 = c(94, - 155, 358), PCT012H131 = c(79, 165, 404), PCT012L025 = c(16, - 12, 151), PCT012M025 = c(2, 0, 1), PCT012L026 = c(32, 13, - 149), PCT012M026 = c(1, 1, 0), PCT012L027 = c(24, 9, 180), - PCT012M027 = c(0, 1, 3), PCT012L129 = c(19, 7, 151), PCT012M129 = c(0, - 1, 1), PCT012L130 = c(29, 13, 148), PCT012M130 = c(2, 0, - 2), PCT012L131 = c(19, 11, 206), PCT012M131 = c(1, 0, 1), - PCT012K025 = c(10, 2, 4), PCT012N025 = c(2, 5, 7), PCT012O025 = c(38, - 16, 64), PCT012K026 = c(7, 1, 3), PCT012N026 = c(1, 2, 3), - PCT012O026 = c(25, 11, 57), PCT012K027 = c(7, 6, 5), PCT012N027 = c(4, - 1, 10), 
PCT012O027 = c(27, 14, 56), PCT012K129 = c(5, 7, - 4), PCT012N129 = c(1, 2, 8), PCT012O129 = c(36, 20, 91), - PCT012K130 = c(5, 1, 4), PCT012N130 = c(2, 4, 6), PCT012O130 = c(40, - 22, 58), PCT012K131 = c(5, 7, 8), PCT012N131 = c(2, 0, 8), - PCT012O131 = c(37, 15, 83), PCT012I028 = c(656, 657, 2154 - ), PCT012I029 = c(702, 691, 1974), PCT012I030 = c(646, 684, - 2106), PCT012I031 = c(626, 680, 1989), PCT012I032 = c(647, - 690, 1964), PCT012I132 = c(678, 668, 1981), PCT012I133 = c(682, - 688, 1935), PCT012I134 = c(666, 666, 2019), PCT012I135 = c(671, - 689, 1973), PCT012I136 = c(683, 718, 2106), PCT012J028 = c(229, - 188, 811), PCT012J029 = c(225, 155, 802), PCT012J030 = c(233, - 160, 842), PCT012J031 = c(240, 141, 767), PCT012J032 = c(192, - 162, 797), PCT012J132 = c(266, 164, 867), PCT012J133 = c(241, - 154, 848), PCT012J134 = c(277, 154, 826), PCT012J135 = c(250, - 177, 920), PCT012J136 = c(255, 169, 881), PCT012H028 = c(79, - 260, 483), PCT012H029 = c(82, 246, 470), PCT012H030 = c(102, - 227, 496), PCT012H031 = c(83, 219, 481), PCT012H032 = c(85, - 217, 485), PCT012H132 = c(70, 161, 394), PCT012H133 = c(77, - 163, 424), PCT012H134 = c(96, 174, 440), PCT012H135 = c(83, - 187, 433), PCT012H136 = c(100, 150, 401), PCT012L028 = c(13, - 7, 180), PCT012M028 = c(0, 1, 1), PCT012L029 = c(17, 9, 196 - ), PCT012M029 = c(2, 0, 1), PCT012L030 = c(24, 9, 258), PCT012M030 = c(1, - 0, 1), PCT012L031 = c(15, 12, 260), PCT012M031 = c(0, 0, - 0), PCT012L032 = c(18, 11, 251), PCT012M032 = c(1, 2, 0), - PCT012L132 = c(28, 14, 238), PCT012M132 = c(0, 0, 2), PCT012L133 = c(16, - 15, 258), PCT012M133 = c(1, 1, 0), PCT012L134 = c(22, 8, - 274), PCT012M134 = c(0, 0, 0), PCT012L135 = c(33, 18, 290 - ), PCT012M135 = c(0, 1, 2), PCT012L136 = c(26, 9, 286), PCT012M136 = c(0, - 1, 1), PCT012K028 = c(4, 5, 13), PCT012N028 = c(0, 4, 8), - PCT012O028 = c(32, 17, 46), PCT012K029 = c(7, 3, 5), PCT012N029 = c(1, - 2, 9), PCT012O029 = c(28, 9, 56), PCT012K030 = c(1, 7, 3), - PCT012N030 = c(0, 0, 8), 
PCT012O030 = c(32, 18, 61), PCT012K031 = c(7, - 2, 5), PCT012N031 = c(0, 6, 5), PCT012O031 = c(24, 14, 53 - ), PCT012K032 = c(3, 1, 6), PCT012N032 = c(2, 1, 11), PCT012O032 = c(25, - 17, 43), PCT012K132 = c(8, 3, 7), PCT012N132 = c(1, 2, 6), - PCT012O132 = c(36, 19, 59), PCT012K133 = c(6, 3, 4), PCT012N133 = c(2, - 2, 6), PCT012O133 = c(24, 15, 64), PCT012K134 = c(4, 1, 7 - ), PCT012N134 = c(6, 2, 6), PCT012O134 = c(30, 22, 72), PCT012K135 = c(9, - 8, 5), PCT012N135 = c(2, 0, 12), PCT012O135 = c(29, 21, 65 - ), PCT012K136 = c(3, 3, 4), PCT012N136 = c(1, 1, 6), PCT012O136 = c(28, - 19, 82), PCT012I033 = c(629, 664, 2026), PCT012I034 = c(621, - 690, 1855), PCT012I035 = c(626, 662, 1912), PCT012I036 = c(513, - 622, 1778), PCT012I037 = c(574, 576, 1682), PCT012I137 = c(667, - 672, 2047), PCT012I138 = c(632, 619, 1907), PCT012I139 = c(667, - 645, 1909), PCT012I140 = c(641, 624, 1780), PCT012I141 = c(619, - 608, 1757), PCT012J033 = c(194, 166, 880), PCT012J034 = c(176, - 148, 831), PCT012J035 = c(178, 130, 788), PCT012J036 = c(140, - 125, 720), PCT012J037 = c(216, 112, 722), PCT012J137 = c(249, - 159, 998), PCT012J138 = c(253, 154, 877), PCT012J139 = c(239, - 167, 853), PCT012J140 = c(232, 120, 844), PCT012J141 = c(229, - 146, 884), PCT012H033 = c(86, 217, 505), PCT012H034 = c(69, - 185, 450), PCT012H035 = c(65, 177, 401), PCT012H036 = c(72, - 168, 445), PCT012H037 = c(74, 183, 383), PCT012H137 = c(79, - 172, 441), PCT012H138 = c(63, 145, 389), PCT012H139 = c(70, - 156, 398), PCT012H140 = c(71, 156, 398), PCT012H141 = c(62, - 148, 377), PCT012L033 = c(16, 8, 231), PCT012M033 = c(0, - 0, 1), PCT012L034 = c(16, 17, 264), PCT012M034 = c(0, 0, - 2), PCT012L035 = c(20, 13, 273), PCT012M035 = c(0, 0, 1), - PCT012L036 = c(20, 19, 257), PCT012M036 = c(0, 0, 0), PCT012L037 = c(19, - 14, 250), PCT012M037 = c(0, 0, 0), PCT012L137 = c(25, 11, - 257), PCT012M137 = c(0, 1, 0), PCT012L138 = c(20, 11, 230 - ), PCT012M138 = c(0, 1, 0), PCT012L139 = c(33, 10, 264), - PCT012M139 = c(2, 0, 
0), PCT012L140 = c(30, 12, 235), PCT012M140 = c(1, - 1, 1), PCT012L141 = c(33, 22, 229), PCT012M141 = c(1, 0, - 0), PCT012K033 = c(6, 4, 3), PCT012N033 = c(1, 1, 6), PCT012O033 = c(26, - 14, 64), PCT012K034 = c(6, 4, 2), PCT012N034 = c(3, 2, 5), - PCT012O034 = c(16, 9, 49), PCT012K035 = c(6, 6, 2), PCT012N035 = c(2, - 2, 8), PCT012O035 = c(25, 17, 54), PCT012K036 = c(3, 2, 6 - ), PCT012N036 = c(1, 2, 3), PCT012O036 = c(15, 14, 43), PCT012K037 = c(6, - 2, 10), PCT012N037 = c(0, 0, 5), PCT012O037 = c(12, 15, 50 - ), PCT012K137 = c(5, 4, 6), PCT012N137 = c(1, 1, 7), PCT012O137 = c(22, - 23, 88), PCT012K138 = c(2, 2, 10), PCT012N138 = c(2, 1, 13 - ), PCT012O138 = c(19, 13, 70), PCT012K139 = c(8, 6, 5), PCT012N139 = c(2, - 3, 5), PCT012O139 = c(28, 12, 58), PCT012K140 = c(4, 2, 9 - ), PCT012N140 = c(0, 2, 6), PCT012O140 = c(32, 13, 51), PCT012K141 = c(4, - 5, 5), PCT012N141 = c(3, 2, 9), PCT012O141 = c(21, 13, 55 - ), PCT012I038 = c(568, 602, 1874), PCT012I039 = c(551, 636, - 1751), PCT012I040 = c(584, 682, 1930), PCT012I041 = c(651, - 780, 2031), PCT012I042 = c(694, 823, 2308), PCT012I142 = c(656, - 682, 1822), PCT012I143 = c(564, 690, 1823), PCT012I144 = c(645, - 745, 1862), PCT012I145 = c(748, 784, 2031), PCT012I146 = c(714, - 817, 2338), PCT012J038 = c(195, 131, 792), PCT012J039 = c(171, - 128, 733), PCT012J040 = c(188, 143, 795), PCT012J041 = c(220, - 134, 762), PCT012J042 = c(243, 152, 1001), PCT012J142 = c(232, - 143, 885), PCT012J143 = c(228, 123, 826), PCT012J144 = c(239, - 129, 930), PCT012J145 = c(250, 131, 965), PCT012J146 = c(303, - 173, 1093), PCT012H038 = c(72, 153, 402), PCT012H039 = c(66, - 151, 374), PCT012H040 = c(49, 193, 388), PCT012H041 = c(56, - 152, 369), PCT012H042 = c(54, 118, 333), PCT012H142 = c(66, - 142, 384), PCT012H143 = c(71, 136, 376), PCT012H144 = c(62, - 148, 335), PCT012H145 = c(63, 109, 354), PCT012H146 = c(70, - 122, 335), PCT012L038 = c(20, 13, 240), PCT012M038 = c(0, - 2, 0), PCT012L039 = c(18, 10, 259), PCT012M039 = c(0, 1, - 
0), PCT012L040 = c(32, 19, 224), PCT012M040 = c(1, 0, 0), - PCT012L041 = c(29, 16, 228), PCT012M041 = c(0, 2, 1), PCT012L042 = c(23, - 11, 219), PCT012M042 = c(0, 0, 0), PCT012L142 = c(36, 18, - 228), PCT012M142 = c(0, 2, 3), PCT012L143 = c(29, 18, 204 - ), PCT012M143 = c(0, 0, 2), PCT012L144 = c(31, 21, 217), - PCT012M144 = c(0, 0, 0), PCT012L145 = c(31, 15, 247), PCT012M145 = c(1, - 1, 2), PCT012L146 = c(32, 18, 193), PCT012M146 = c(0, 2, - 1), PCT012K038 = c(8, 6, 12), PCT012N038 = c(4, 2, 10), PCT012O038 = c(15, - 12, 44), PCT012K039 = c(7, 4, 13), PCT012N039 = c(0, 1, 6 - ), PCT012O039 = c(21, 6, 47), PCT012K040 = c(4, 3, 3), PCT012N040 = c(0, - 3, 8), PCT012O040 = c(15, 10, 39), PCT012K041 = c(2, 4, 7 - ), PCT012N041 = c(2, 0, 8), PCT012O041 = c(19, 5, 44), PCT012K042 = c(7, - 5, 4), PCT012N042 = c(0, 2, 7), PCT012O042 = c(19, 21, 49 - ), PCT012K142 = c(6, 10, 5), PCT012N142 = c(2, 1, 6), PCT012O142 = c(30, - 11, 63), PCT012K143 = c(2, 2, 10), PCT012N143 = c(1, 0, 5 - ), PCT012O143 = c(21, 6, 48), PCT012K144 = c(10, 7, 8), PCT012N144 = c(1, - 3, 4), PCT012O144 = c(23, 13, 57), PCT012K145 = c(3, 3, 7 - ), PCT012N145 = c(3, 2, 4), PCT012O145 = c(18, 13, 47), PCT012K146 = c(8, - 8, 5), PCT012N146 = c(2, 4, 6), PCT012O146 = c(15, 12, 59 - ), PCT012I043 = c(729, 839, 2167), PCT012I044 = c(710, 771, - 2177), PCT012I045 = c(652, 803, 2141), PCT012I046 = c(747, - 893, 2367), PCT012I047 = c(765, 976, 2383), PCT012I147 = c(764, - 857, 2325), PCT012I148 = c(646, 841, 2257), PCT012I149 = c(700, - 862, 2272), PCT012I150 = c(758, 904, 2360), PCT012I151 = c(789, - 971, 2434), PCT012J043 = c(222, 158, 923), PCT012J044 = c(190, - 154, 834), PCT012J045 = c(242, 159, 835), PCT012J046 = c(225, - 173, 881), PCT012J047 = c(236, 181, 881), PCT012J147 = c(243, - 178, 1078), PCT012J148 = c(276, 160, 1002), PCT012J149 = c(258, - 178, 1144), PCT012J150 = c(270, 200, 1030), PCT012J151 = c(274, - 202, 1146), PCT012H043 = c(52, 149, 369), PCT012H044 = c(42, - 138, 341), PCT012H045 = c(54, 
101, 315), PCT012H046 = c(43, - 93, 305), PCT012H047 = c(49, 84, 307), PCT012H147 = c(53, - 112, 349), PCT012H148 = c(59, 83, 302), PCT012H149 = c(66, - 77, 310), PCT012H150 = c(47, 82, 331), PCT012H151 = c(40, - 94, 289), PCT012L043 = c(28, 14, 218), PCT012M043 = c(1, - 0, 1), PCT012L044 = c(26, 17, 180), PCT012M044 = c(0, 0, - 0), PCT012L045 = c(27, 12, 145), PCT012M045 = c(0, 0, 0), - PCT012L046 = c(20, 19, 153), PCT012M046 = c(2, 0, 1), PCT012L047 = c(24, - 21, 162), PCT012M047 = c(0, 0, 0), PCT012L147 = c(44, 20, - 227), PCT012M147 = c(0, 2, 2), PCT012L148 = c(24, 16, 211 - ), PCT012M148 = c(0, 0, 0), PCT012L149 = c(30, 10, 138), - PCT012M149 = c(0, 0, 1), PCT012L150 = c(40, 18, 165), PCT012M150 = c(0, - 0, 1), PCT012L151 = c(31, 22, 147), PCT012M151 = c(2, 0, - 0), PCT012K043 = c(4, 1, 6), PCT012N043 = c(1, 4, 7), PCT012O043 = c(12, - 15, 60), PCT012K044 = c(6, 10, 10), PCT012N044 = c(0, 1, - 3), PCT012O044 = c(13, 10, 44), PCT012K045 = c(7, 3, 10), - PCT012N045 = c(1, 2, 2), PCT012O045 = c(13, 11, 34), PCT012K046 = c(5, - 5, 5), PCT012N046 = c(1, 0, 4), PCT012O046 = c(15, 8, 43), - PCT012K047 = c(5, 12, 3), PCT012N047 = c(1, 2, 3), PCT012O047 = c(13, - 12, 36), PCT012K147 = c(7, 3, 5), PCT012N147 = c(3, 1, 4), - PCT012O147 = c(27, 9, 54), PCT012K148 = c(6, 5, 5), PCT012N148 = c(0, - 0, 8), PCT012O148 = c(14, 8, 59), PCT012K149 = c(7, 3, 6), - PCT012N149 = c(2, 1, 5), PCT012O149 = c(22, 5, 57), PCT012K150 = c(10, - 6, 10), PCT012N150 = c(0, 2, 4), PCT012O150 = c(22, 11, 39 - ), PCT012K151 = c(14, 10, 6), PCT012N151 = c(1, 3, 7), PCT012O151 = c(14, - 12, 47), PCT012I048 = c(811, 1065, 2584), PCT012I049 = c(842, - 1062, 2617), PCT012I050 = c(811, 1067, 2680), PCT012I051 = c(781, - 1062, 2780), PCT012I052 = c(863, 1121, 2652), PCT012I152 = c(872, - 1023, 2626), PCT012I153 = c(841, 1073, 2685), PCT012I154 = c(870, - 1150, 2751), PCT012I155 = c(858, 1129, 2779), PCT012I156 = c(839, - 1261, 2890), PCT012J048 = c(219, 170, 929), PCT012J049 = c(271, - 198, 942), 
PCT012J050 = c(269, 173, 898), PCT012J051 = c(237, - 185, 923), PCT012J052 = c(254, 180, 820), PCT012J152 = c(290, - 202, 1094), PCT012J153 = c(274, 174, 1097), PCT012J154 = c(292, - 205, 1117), PCT012J155 = c(254, 177, 1087), PCT012J156 = c(275, - 200, 997), PCT012H048 = c(40, 74, 281), PCT012H049 = c(45, - 77, 297), PCT012H050 = c(52, 66, 250), PCT012H051 = c(42, - 76, 252), PCT012H052 = c(50, 60, 210), PCT012H152 = c(61, - 80, 287), PCT012H153 = c(42, 67, 226), PCT012H154 = c(41, - 62, 223), PCT012H155 = c(48, 49, 199), PCT012H156 = c(53, - 53, 255), PCT012L048 = c(26, 14, 166), PCT012M048 = c(0, - 0, 1), PCT012L049 = c(24, 11, 155), PCT012M049 = c(0, 0, - 1), PCT012L050 = c(28, 12, 156), PCT012M050 = c(0, 1, 0), - PCT012L051 = c(16, 17, 147), PCT012M051 = c(0, 1, 0), PCT012L052 = c(12, - 14, 122), PCT012M052 = c(0, 0, 1), PCT012L152 = c(41, 26, - 161), PCT012M152 = c(1, 0, 0), PCT012L153 = c(33, 19, 162 - ), PCT012M153 = c(3, 1, 3), PCT012L154 = c(37, 16, 172), - PCT012M154 = c(0, 0, 1), PCT012L155 = c(35, 20, 148), PCT012M155 = c(0, - 0, 0), PCT012L156 = c(31, 17, 156), PCT012M156 = c(0, 0, - 1), PCT012K048 = c(3, 12, 8), PCT012N048 = c(2, 1, 2), PCT012O048 = c(14, - 15, 31), PCT012K049 = c(9, 17, 13), PCT012N049 = c(2, 1, - 3), PCT012O049 = c(10, 10, 48), PCT012K050 = c(8, 8, 8), - PCT012N050 = c(1, 0, 6), PCT012O050 = c(13, 11, 45), PCT012K051 = c(10, - 17, 5), PCT012N051 = c(2, 0, 4), PCT012O051 = c(16, 18, 39 - ), PCT012K052 = c(7, 9, 8), PCT012N052 = c(2, 2, 3), PCT012O052 = c(11, - 12, 35), PCT012K152 = c(9, 10, 6), PCT012N152 = c(4, 1, 4 - ), PCT012O152 = c(17, 14, 68), PCT012K153 = c(6, 9, 9), PCT012N153 = c(1, - 5, 3), PCT012O153 = c(16, 17, 39), PCT012K154 = c(9, 6, 17 - ), PCT012N154 = c(4, 0, 3), PCT012O154 = c(18, 12, 51), PCT012K155 = c(3, - 8, 7), PCT012N155 = c(0, 0, 6), PCT012O155 = c(16, 14, 50 - ), PCT012K156 = c(6, 8, 9), PCT012N156 = c(3, 4, 5), PCT012O156 = c(22, - 19, 42), PCT012I053 = c(797, 1113, 2702), PCT012I054 = c(813, - 1157, 
2809), PCT012I055 = c(759, 1121, 2835), PCT012I056 = c(735, - 1084, 2718), PCT012I057 = c(727, 1143, 2661), PCT012I157 = c(869, - 1143, 2958), PCT012I158 = c(807, 1185, 2836), PCT012I159 = c(829, - 1290, 2927), PCT012I160 = c(829, 1284, 2767), PCT012I161 = c(785, - 1242, 2803), PCT012J053 = c(239, 179, 851), PCT012J054 = c(204, - 162, 832), PCT012J055 = c(204, 201, 803), PCT012J056 = c(215, - 141, 777), PCT012J057 = c(201, 182, 749), PCT012J157 = c(256, - 222, 1080), PCT012J158 = c(258, 216, 946), PCT012J159 = c(275, - 199, 997), PCT012J160 = c(258, 173, 970), PCT012J161 = c(248, - 163, 890), PCT012H053 = c(35, 75, 211), PCT012H054 = c(41, - 53, 207), PCT012H055 = c(28, 39, 193), PCT012H056 = c(35, - 43, 189), PCT012H057 = c(33, 49, 157), PCT012H157 = c(33, - 52, 173), PCT012H158 = c(39, 43, 156), PCT012H159 = c(38, - 43, 173), PCT012H160 = c(38, 34, 157), PCT012H161 = c(45, - 40, 157), PCT012L053 = c(17, 18, 147), PCT012M053 = c(1, - 0, 1), PCT012L054 = c(18, 14, 119), PCT012M054 = c(1, 0, - 0), PCT012L055 = c(18, 15, 126), PCT012M055 = c(1, 2, 0), - PCT012L056 = c(13, 12, 113), PCT012M056 = c(1, 0, 1), PCT012L057 = c(15, - 5, 115), PCT012M057 = c(2, 1, 0), PCT012L157 = c(25, 15, - 140), PCT012M157 = c(1, 1, 2), PCT012L158 = c(34, 19, 121 - ), PCT012M158 = c(3, 1, 0), PCT012L159 = c(19, 13, 113), - PCT012M159 = c(1, 0, 0), PCT012L160 = c(28, 15, 143), PCT012M160 = c(3, - 1, 0), PCT012L161 = c(28, 16, 126), PCT012M161 = c(0, 0, - 0), PCT012K053 = c(10, 13, 9), PCT012N053 = c(1, 1, 4), PCT012O053 = c(10, - 7, 27), PCT012K054 = c(3, 13, 8), PCT012N054 = c(1, 0, 5), - PCT012O054 = c(20, 11, 30), PCT012K055 = c(8, 9, 7), PCT012N055 = c(1, - 0, 6), PCT012O055 = c(14, 13, 26), PCT012K056 = c(5, 13, - 9), PCT012N056 = c(0, 0, 6), PCT012O056 = c(10, 13, 29), - PCT012K057 = c(17, 5, 11), PCT012N057 = c(1, 0, 5), PCT012O057 = c(13, - 11, 30), PCT012K157 = c(12, 6, 15), PCT012N157 = c(1, 1, - 4), PCT012O157 = c(17, 15, 45), PCT012K158 = c(7, 6, 10), - PCT012N158 = c(0, 1, 6), 
PCT012O158 = c(17, 7, 34), PCT012K159 = c(9, - 9, 11), PCT012N159 = c(1, 2, 3), PCT012O159 = c(18, 11, 31 - ), PCT012K160 = c(7, 7, 10), PCT012N160 = c(3, 1, 4), PCT012O160 = c(16, - 13, 37), PCT012K161 = c(10, 3, 7), PCT012N161 = c(3, 2, 6 - ), PCT012O161 = c(12, 8, 43), PCT012I058 = c(673, 1133, 2554 - ), PCT012I059 = c(682, 1169, 2356), PCT012I060 = c(622, 1131, - 2476), PCT012I061 = c(628, 1114, 2265), PCT012I062 = c(593, - 1166, 2124), PCT012I162 = c(766, 1267, 2624), PCT012I163 = c(717, - 1287, 2546), PCT012I164 = c(716, 1360, 2612), PCT012I165 = c(711, - 1345, 2464), PCT012I166 = c(673, 1311, 2374), PCT012J058 = c(181, - 139, 694), PCT012J059 = c(188, 147, 602), PCT012J060 = c(187, - 162, 572), PCT012J061 = c(170, 116, 510), PCT012J062 = c(176, - 132, 591), PCT012J162 = c(247, 148, 881), PCT012J163 = c(241, - 171, 767), PCT012J164 = c(228, 165, 789), PCT012J165 = c(230, - 172, 761), PCT012J166 = c(199, 141, 708), PCT012H058 = c(36, - 40, 168), PCT012H059 = c(18, 47, 152), PCT012H060 = c(30, - 32, 141), PCT012H061 = c(21, 37, 105), PCT012H062 = c(24, - 24, 113), PCT012H162 = c(25, 32, 139), PCT012H163 = c(33, - 37, 142), PCT012H164 = c(18, 33, 117), PCT012H165 = c(12, - 26, 119), PCT012H166 = c(25, 33, 109), PCT012L058 = c(13, - 7, 117), PCT012M058 = c(0, 0, 2), PCT012L059 = c(7, 14, 98 - ), PCT012M059 = c(0, 0, 1), PCT012L060 = c(14, 9, 97), PCT012M060 = c(1, - 1, 0), PCT012L061 = c(7, 6, 88), PCT012M061 = c(0, 0, 0), - PCT012L062 = c(10, 7, 88), PCT012M062 = c(0, 0, 0), PCT012L162 = c(35, - 18, 131), PCT012M162 = c(0, 0, 2), PCT012L163 = c(26, 13, - 101), PCT012M163 = c(1, 0, 1), PCT012L164 = c(20, 13, 100 - ), PCT012M164 = c(0, 0, 0), PCT012L165 = c(31, 6, 100), PCT012M165 = c(1, - 0, 0), PCT012L166 = c(21, 10, 115), PCT012M166 = c(0, 0, - 0), PCT012K058 = c(9, 12, 9), PCT012N058 = c(0, 1, 1), PCT012O058 = c(11, - 7, 15), PCT012K059 = c(7, 6, 8), PCT012N059 = c(1, 0, 6), - PCT012O059 = c(10, 7, 24), PCT012K060 = c(7, 4, 7), PCT012N060 = c(0, - 0, 2), 
PCT012O060 = c(6, 9, 24), PCT012K061 = c(10, 9, 12 - ), PCT012N061 = c(0, 3, 4), PCT012O061 = c(8, 8, 19), PCT012K062 = c(2, - 12, 4), PCT012N062 = c(0, 1, 3), PCT012O062 = c(4, 6, 21), - PCT012K162 = c(7, 14, 6), PCT012N162 = c(1, 2, 1), PCT012O162 = c(9, - 9, 37), PCT012K163 = c(11, 5, 7), PCT012N163 = c(1, 0, 2), - PCT012O163 = c(14, 12, 34), PCT012K164 = c(7, 7, 11), PCT012N164 = c(0, - 2, 5), PCT012O164 = c(9, 7, 35), PCT012K165 = c(7, 6, 2), - PCT012N165 = c(0, 0, 2), PCT012O165 = c(12, 12, 30), PCT012K166 = c(12, - 7, 8), PCT012N166 = c(0, 1, 4), PCT012O166 = c(6, 8, 31), - PCT012I063 = c(645, 1264, 2040), PCT012I064 = c(628, 1287, - 2088), PCT012I167 = c(638, 1364, 2305), PCT012I168 = c(744, - 1466, 2204), PCT012J063 = c(158, 107, 593), PCT012J064 = c(155, - 119, 508), PCT012J167 = c(215, 137, 680), PCT012J168 = c(202, - 137, 685), PCT012H063 = c(19, 29, 108), PCT012H064 = c(23, - 30, 94), PCT012H167 = c(32, 29, 99), PCT012H168 = c(29, 23, - 101), PCT012L063 = c(14, 11, 88), PCT012M063 = c(0, 1, 0), - PCT012L064 = c(13, 9, 75), PCT012M064 = c(0, 0, 0), PCT012L167 = c(22, - 11, 98), PCT012M167 = c(0, 1, 0), PCT012L168 = c(27, 16, - 83), PCT012M168 = c(0, 2, 2), PCT012K063 = c(9, 7, 1), PCT012N063 = c(0, - 0, 1), PCT012O063 = c(6, 5, 22), PCT012K064 = c(4, 4, 3), - PCT012N064 = c(2, 1, 3), PCT012O064 = c(10, 13, 18), PCT012K167 = c(8, - 6, 4), PCT012N167 = c(1, 0, 4), PCT012O167 = c(6, 8, 24), - PCT012K168 = c(8, 8, 4), PCT012N168 = c(0, 0, 2), PCT012O168 = c(7, - 9, 22), PCT012I065 = c(698, 1424, 2125), PCT012I066 = c(716, - 1481, 2257), PCT012I067 = c(511, 1173, 1444), PCT012I169 = c(746, - 1674, 2317), PCT012I170 = c(770, 1643, 2315), PCT012I171 = c(592, - 1279, 1746), PCT012J065 = c(178, 112, 489), PCT012J066 = c(162, - 110, 460), PCT012J067 = c(108, 93, 372), PCT012J169 = c(181, - 147, 682), PCT012J170 = c(187, 105, 585), PCT012J171 = c(178, - 96, 492), PCT012H065 = c(24, 16, 85), PCT012H066 = c(16, - 29, 84), PCT012H067 = c(11, 26, 64), PCT012H169 = 
c(28, 21, - 100), PCT012H170 = c(26, 17, 90), PCT012H171 = c(25, 30, - 82), PCT012L065 = c(12, 6, 77), PCT012M065 = c(0, 0, 1), - PCT012L066 = c(13, 14, 69), PCT012M066 = c(0, 0, 0), PCT012L067 = c(4, - 9, 70), PCT012M067 = c(0, 0, 1), PCT012L169 = c(26, 14, 109 - ), PCT012M169 = c(0, 0, 1), PCT012L170 = c(24, 6, 97), PCT012M170 = c(0, - 1, 1), PCT012L171 = c(24, 8, 83), PCT012M171 = c(0, 0, 0), - PCT012K065 = c(9, 5, 7), PCT012N065 = c(0, 1, 2), PCT012O065 = c(14, - 6, 13), PCT012K066 = c(7, 13, 3), PCT012N066 = c(1, 0, 2), - PCT012O066 = c(8, 9, 19), PCT012K067 = c(3, 6, 4), PCT012N067 = c(3, - 0, 2), PCT012O067 = c(2, 6, 12), PCT012K169 = c(10, 8, 5), - PCT012N169 = c(0, 0, 1), PCT012O169 = c(9, 16, 19), PCT012K170 = c(3, - 3, 4), PCT012N170 = c(0, 0, 1), PCT012O170 = c(6, 10, 34), - PCT012K171 = c(6, 9, 4), PCT012N171 = c(1, 0, 1), PCT012O171 = c(7, - 6, 13), PCT012I068 = c(544, 1217, 1557), PCT012I069 = c(579, - 1275, 1549), PCT012I172 = c(594, 1321, 1687), PCT012I173 = c(653, - 1407, 1759), PCT012J068 = c(110, 75, 366), PCT012J069 = c(118, - 84, 347), PCT012J172 = c(152, 118, 456), PCT012J173 = c(141, - 90, 471), PCT012H068 = c(7, 24, 71), PCT012H069 = c(10, 18, - 40), PCT012H172 = c(16, 14, 81), PCT012H173 = c(15, 19, 59 - ), PCT012L068 = c(10, 5, 67), PCT012M068 = c(0, 0, 0), PCT012L069 = c(14, - 2, 49), PCT012M069 = c(0, 0, 0), PCT012L172 = c(30, 11, 80 - ), PCT012M172 = c(0, 0, 0), PCT012L173 = c(20, 17, 64), PCT012M173 = c(0, - 0, 0), PCT012K068 = c(5, 3, 3), PCT012N068 = c(1, 0, 3), - PCT012O068 = c(9, 4, 13), PCT012K069 = c(3, 9, 3), PCT012N069 = c(0, - 0, 2), PCT012O069 = c(9, 10, 17), PCT012K172 = c(0, 3, 3), - PCT012N172 = c(1, 1, 1), PCT012O172 = c(8, 4, 12), PCT012K173 = c(3, - 1, 3), PCT012N173 = c(0, 0, 2), PCT012O173 = c(7, 8, 7), - PCT012I070 = c(598, 1359, 1497), PCT012I071 = c(504, 1113, - 1327), PCT012I072 = c(405, 1060, 1252), PCT012I174 = c(611, - 1472, 1837), PCT012I175 = c(562, 1238, 1610), PCT012I176 = c(533, - 1186, 1429), PCT012J070 
= c(136, 80, 345), PCT012J071 = c(105, - 92, 300), PCT012J072 = c(88, 83, 273), PCT012J174 = c(142, - 95, 468), PCT012J175 = c(130, 101, 378), PCT012J176 = c(122, - 72, 358), PCT012H070 = c(13, 17, 47), PCT012H071 = c(7, 17, - 55), PCT012H072 = c(18, 16, 48), PCT012H174 = c(14, 14, 56 - ), PCT012H175 = c(27, 18, 53), PCT012H176 = c(14, 17, 60), - PCT012L070 = c(7, 6, 79), PCT012M070 = c(1, 0, 0), PCT012L071 = c(10, - 11, 55), PCT012M071 = c(0, 0, 0), PCT012L072 = c(8, 9, 66 - ), PCT012M072 = c(0, 0, 0), PCT012L174 = c(20, 4, 49), PCT012M174 = c(0, - 0, 0), PCT012L175 = c(26, 10, 75), PCT012M175 = c(0, 0, 0 - ), PCT012L176 = c(19, 6, 54), PCT012M176 = c(0, 1, 0), PCT012K070 = c(4, - 5, 5), PCT012N070 = c(0, 1, 0), PCT012O070 = c(4, 10, 14), - PCT012K071 = c(4, 6, 3), PCT012N071 = c(0, 0, 1), PCT012O071 = c(3, - 5, 12), PCT012K072 = c(5, 8, 0), PCT012N072 = c(0, 0, 1), - PCT012O072 = c(3, 4, 9), PCT012K174 = c(3, 5, 5), PCT012N174 = c(1, - 0, 0), PCT012O174 = c(12, 8, 17), PCT012K175 = c(3, 6, 3), - PCT012N175 = c(0, 1, 1), PCT012O175 = c(10, 11, 15), PCT012K176 = c(4, - 6, 5), PCT012N176 = c(0, 1, 1), PCT012O176 = c(9, 7, 9), - PCT012I073 = c(430, 987, 1065), PCT012I074 = c(422, 1015, - 1134), PCT012I075 = c(399, 932, 1057), PCT012I076 = c(363, - 809, 904), PCT012I077 = c(309, 809, 918), PCT012I177 = c(488, - 1087, 1294), PCT012I178 = c(442, 1063, 1320), PCT012I179 = c(495, - 972, 1243), PCT012I180 = c(433, 896, 1164), PCT012I181 = c(427, - 865, 1146), PCT012J073 = c(106, 72, 272), PCT012J074 = c(97, - 47, 236), PCT012J075 = c(91, 60, 203), PCT012J076 = c(74, - 44, 214), PCT012J077 = c(77, 38, 188), PCT012J177 = c(110, - 82, 334), PCT012J178 = c(105, 81, 320), PCT012J179 = c(109, - 68, 312), PCT012J180 = c(92, 51, 267), PCT012J181 = c(79, - 46, 292), PCT012H073 = c(13, 19, 36), PCT012H074 = c(8, 16, - 41), PCT012H075 = c(12, 11, 45), PCT012H076 = c(16, 12, 35 - ), PCT012H077 = c(5, 13, 27), PCT012H177 = c(13, 14, 52), - PCT012H178 = c(17, 13, 44), PCT012H179 = c(11, 
18, 44), PCT012H180 = c(18, - 8, 30), PCT012H181 = c(6, 13, 34), PCT012L073 = c(10, 12, - 46), PCT012M073 = c(0, 0, 0), PCT012L074 = c(6, 8, 54), PCT012M074 = c(0, - 0, 0), PCT012L075 = c(10, 5, 48), PCT012M075 = c(0, 0, 0), - PCT012L076 = c(8, 6, 36), PCT012M076 = c(0, 0, 0), PCT012L077 = c(6, - 3, 50), PCT012M077 = c(0, 0, 0), PCT012L177 = c(16, 7, 58 - ), PCT012M177 = c(0, 0, 0), PCT012L178 = c(10, 7, 45), PCT012M178 = c(0, - 0, 0), PCT012L179 = c(14, 3, 58), PCT012M179 = c(0, 0, 0), - PCT012L180 = c(11, 5, 46), PCT012M180 = c(0, 0, 0), PCT012L181 = c(10, - 6, 37), PCT012M181 = c(0, 0, 0), PCT012K073 = c(5, 4, 3), - PCT012N073 = c(1, 0, 0), PCT012O073 = c(4, 6, 9), PCT012K074 = c(4, - 3, 3), PCT012N074 = c(0, 0, 0), PCT012O074 = c(5, 6, 6), - PCT012K075 = c(5, 8, 2), PCT012N075 = c(1, 0, 0), PCT012O075 = c(4, - 8, 11), PCT012K076 = c(0, 3, 1), PCT012N076 = c(0, 0, 1), - PCT012O076 = c(6, 5, 8), PCT012K077 = c(2, 9, 3), PCT012N077 = c(1, - 0, 0), PCT012O077 = c(2, 7, 4), PCT012K177 = c(3, 4, 3), - PCT012N177 = c(3, 0, 3), PCT012O177 = c(5, 4, 6), PCT012K178 = c(4, - 5, 2), PCT012N178 = c(0, 0, 0), PCT012O178 = c(7, 4, 8), - PCT012K179 = c(5, 10, 5), PCT012N179 = c(2, 0, 1), PCT012O179 = c(3, - 5, 8), PCT012K180 = c(3, 6, 2), PCT012N180 = c(0, 1, 0), - PCT012O180 = c(3, 8, 5), PCT012K181 = c(5, 6, 0), PCT012N181 = c(0, - 0, 1), PCT012O181 = c(4, 3, 7), PCT012I078 = c(360, 781, - 916), PCT012I079 = c(320, 660, 816), PCT012I080 = c(290, - 660, 812), PCT012I081 = c(300, 640, 812), PCT012I082 = c(293, - 606, 791), PCT012I182 = c(407, 800, 1158), PCT012I183 = c(342, - 720, 1078), PCT012I184 = c(366, 733, 1114), PCT012I185 = c(349, - 702, 1083), PCT012I186 = c(314, 606, 1159), PCT012J078 = c(75, - 42, 196), PCT012J079 = c(62, 34, 151), PCT012J080 = c(50, - 48, 133), PCT012J081 = c(48, 30, 129), PCT012J082 = c(50, - 31, 111), PCT012J182 = c(92, 60, 260), PCT012J183 = c(81, - 48, 197), PCT012J184 = c(85, 54, 203), PCT012J185 = c(68, - 63, 197), PCT012J186 = c(66, 55, 
207), PCT012H078 = c(10, - 5, 31), PCT012H079 = c(6, 11, 19), PCT012H080 = c(5, 10, - 17), PCT012H081 = c(2, 10, 25), PCT012H082 = c(4, 3, 22), - PCT012H182 = c(10, 9, 36), PCT012H183 = c(7, 15, 38), PCT012H184 = c(9, - 4, 28), PCT012H185 = c(8, 8, 21), PCT012H186 = c(10, 4, 19 - ), PCT012L078 = c(4, 5, 29), PCT012M078 = c(0, 1, 0), PCT012L079 = c(3, - 3, 26), PCT012M079 = c(0, 0, 0), PCT012L080 = c(2, 6, 31), - PCT012M080 = c(0, 0, 0), PCT012L081 = c(2, 2, 14), PCT012M081 = c(0, - 0, 0), PCT012L082 = c(3, 1, 22), PCT012M082 = c(0, 0, 0), - PCT012L182 = c(16, 5, 33), PCT012M182 = c(0, 0, 0), PCT012L183 = c(10, - 4, 31), PCT012M183 = c(1, 1, 0), PCT012L184 = c(11, 5, 17 - ), PCT012M184 = c(0, 0, 2), PCT012L185 = c(10, 3, 26), PCT012M185 = c(0, - 0, 0), PCT012L186 = c(9, 4, 20), PCT012M186 = c(0, 0, 0), - PCT012K078 = c(2, 2, 2), PCT012N078 = c(0, 0, 1), PCT012O078 = c(1, - 6, 2), PCT012K079 = c(0, 2, 1), PCT012N079 = c(1, 0, 0), - PCT012O079 = c(3, 6, 3), PCT012K080 = c(5, 2, 0), PCT012N080 = c(0, - 1, 2), PCT012O080 = c(1, 1, 5), PCT012K081 = c(4, 2, 3), - PCT012N081 = c(0, 0, 0), PCT012O081 = c(0, 1, 7), PCT012K082 = c(2, - 2, 1), PCT012N082 = c(0, 0, 0), PCT012O082 = c(0, 1, 3), - PCT012K182 = c(2, 3, 2), PCT012N182 = c(1, 1, 0), PCT012O182 = c(7, - 7, 10), PCT012K183 = c(1, 4, 2), PCT012N183 = c(0, 0, 0), - PCT012O183 = c(4, 1, 12), PCT012K184 = c(3, 7, 1), PCT012N184 = c(0, - 2, 1), PCT012O184 = c(4, 6, 4), PCT012K185 = c(2, 3, 1), - PCT012N185 = c(1, 0, 0), PCT012O185 = c(1, 4, 3), PCT012K186 = c(3, - 6, 2), PCT012N186 = c(0, 0, 0), PCT012O186 = c(6, 4, 7), - PCT012I083 = c(235, 538, 748), PCT012I084 = c(218, 491, 680 - ), PCT012I085 = c(187, 457, 660), PCT012I086 = c(147, 333, - 588), PCT012I087 = c(142, 304, 528), PCT012I187 = c(288, - 554, 1052), PCT012I188 = c(293, 527, 971), PCT012I189 = c(243, - 548, 951), PCT012I190 = c(225, 514, 935), PCT012I191 = c(210, - 397, 824), PCT012J083 = c(26, 38, 108), PCT012J084 = c(27, - 23, 90), PCT012J085 = c(39, 23, 86), 
PCT012J086 = c(31, 17, - 65), PCT012J087 = c(18, 19, 46), PCT012J187 = c(42, 49, 168 - ), PCT012J188 = c(55, 51, 181), PCT012J189 = c(29, 26, 147 - ), PCT012J190 = c(38, 44, 136), PCT012J191 = c(36, 41, 105 - ), PCT012H083 = c(5, 3, 24), PCT012H084 = c(5, 6, 23), PCT012H085 = c(2, - 3, 15), PCT012H086 = c(6, 3, 12), PCT012H087 = c(4, 7, 10 - ), PCT012H187 = c(10, 7, 18), PCT012H188 = c(9, 8, 15), PCT012H189 = c(4, - 7, 16), PCT012H190 = c(9, 4, 24), PCT012H191 = c(7, 3, 15 - ), PCT012L083 = c(3, 3, 10), PCT012M083 = c(1, 1, 0), PCT012L084 = c(3, - 2, 19), PCT012M084 = c(0, 2, 0), PCT012L085 = c(1, 2, 6), - PCT012M085 = c(0, 0, 0), PCT012L086 = c(1, 0, 6), PCT012M086 = c(0, - 0, 0), PCT012L087 = c(3, 1, 12), PCT012M087 = c(0, 0, 0), - PCT012L187 = c(7, 1, 18), PCT012M187 = c(0, 0, 0), PCT012L188 = c(5, - 2, 18), PCT012M188 = c(0, 0, 0), PCT012L189 = c(6, 0, 14), - PCT012M189 = c(1, 0, 0), PCT012L190 = c(4, 6, 7), PCT012M190 = c(0, - 0, 0), PCT012L191 = c(4, 1, 15), PCT012M191 = c(0, 0, 0), - PCT012K083 = c(3, 4, 3), PCT012N083 = c(0, 0, 1), PCT012O083 = c(2, - 2, 6), PCT012K084 = c(2, 1, 1), PCT012N084 = c(0, 0, 1), - PCT012O084 = c(3, 0, 9), PCT012K085 = c(2, 1, 0), PCT012N085 = c(0, - 0, 0), PCT012O085 = c(0, 3, 3), PCT012K086 = c(0, 0, 0), - PCT012N086 = c(0, 0, 0), PCT012O086 = c(1, 4, 4), PCT012K087 = c(1, - 2, 1), PCT012N087 = c(0, 0, 0), PCT012O087 = c(0, 0, 1), - PCT012K187 = c(3, 3, 1), PCT012N187 = c(0, 0, 0), PCT012O187 = c(1, - 2, 4), PCT012K188 = c(6, 3, 1), PCT012N188 = c(0, 0, 0), - PCT012O188 = c(2, 1, 4), PCT012K189 = c(4, 3, 0), PCT012N189 = c(1, - 0, 0), PCT012O189 = c(2, 3, 6), PCT012K190 = c(1, 0, 0), - PCT012N190 = c(0, 0, 1), PCT012O190 = c(1, 4, 5), PCT012K191 = c(1, - 0, 2), PCT012N191 = c(0, 0, 1), PCT012O191 = c(1, 1, 5), - PCT012I088 = c(122, 282, 483), PCT012I089 = c(102, 186, 452 - ), PCT012I090 = c(80, 161, 353), PCT012I091 = c(77, 154, - 321), PCT012I092 = c(63, 117, 259), PCT012I093 = c(51, 104, - 198), PCT012I094 = c(36, 61, 159), 
PCT012I095 = c(36, 54, - 110), PCT012I096 = c(17, 41, 80), PCT012I097 = c(9, 15, 59 - ), PCT012I098 = c(8, 16, 45), PCT012I099 = c(5, 10, 29), - PCT012I100 = c(7, 7, 18), PCT012I101 = c(3, 6, 14), PCT012I102 = c(2, - 1, 5), PCT012I103 = c(2, 6, 5), PCT012I104 = c(0, 1, 1), - PCT012I105 = c(0, 0, 0), PCT012I192 = c(213, 384, 806), PCT012I193 = c(215, - 380, 792), PCT012I194 = c(160, 336, 687), PCT012I195 = c(164, - 278, 606), PCT012I196 = c(135, 234, 518), PCT012I197 = c(93, - 189, 404), PCT012I198 = c(77, 181, 358), PCT012I199 = c(69, - 120, 291), PCT012I200 = c(57, 115, 224), PCT012I201 = c(42, - 89, 165), PCT012I202 = c(39, 54, 148), PCT012I203 = c(24, - 54, 116), PCT012I204 = c(27, 33, 75), PCT012I205 = c(13, - 25, 49), PCT012I206 = c(18, 14, 25), PCT012I207 = c(6, 32, - 53), PCT012I208 = c(0, 0, 1), PCT012I209 = c(0, 0, 0), PCT012J088 = c(23, - 29, 59), PCT012J089 = c(19, 19, 47), PCT012J090 = c(17, 17, - 32), PCT012J091 = c(12, 7, 34), PCT012J092 = c(7, 8, 24), - PCT012J093 = c(11, 7, 22), PCT012J094 = c(10, 3, 23), PCT012J095 = c(2, - 8, 13), PCT012J096 = c(3, 4, 16), PCT012J097 = c(0, 6, 3), - PCT012J098 = c(0, 2, 4), PCT012J099 = c(0, 0, 5), PCT012J100 = c(0, - 0, 4), PCT012J101 = c(2, 0, 2), PCT012J102 = c(0, 2, 0), - PCT012J103 = c(1, 2, 4), PCT012J104 = c(0, 0, 0), PCT012J105 = c(0, - 0, 0), PCT012J192 = c(42, 31, 96), PCT012J193 = c(31, 38, - 98), PCT012J194 = c(20, 25, 106), PCT012J195 = c(33, 18, - 57), PCT012J196 = c(19, 22, 69), PCT012J197 = c(21, 11, 54 - ), PCT012J198 = c(13, 12, 36), PCT012J199 = c(11, 5, 34), - PCT012J200 = c(13, 14, 28), PCT012J201 = c(5, 7, 23), PCT012J202 = c(10, - 5, 23), PCT012J203 = c(4, 4, 20), PCT012J204 = c(4, 6, 13 - ), PCT012J205 = c(3, 2, 7), PCT012J206 = c(2, 0, 9), PCT012J207 = c(4, - 4, 16), PCT012J208 = c(0, 0, 2), PCT012J209 = c(0, 0, 0), - PCT012H088 = c(3, 2, 15), PCT012H089 = c(5, 4, 5), PCT012H090 = c(3, - 3, 8), PCT012H091 = c(4, 0, 5), PCT012H092 = c(2, 0, 4), - PCT012H093 = c(1, 1, 4), PCT012H094 = c(0, 
2, 3), PCT012H095 = c(1, - 1, 4), PCT012H096 = c(0, 0, 0), PCT012H097 = c(0, 0, 1), - PCT012H098 = c(0, 0, 0), PCT012H099 = c(0, 0, 1), PCT012H100 = c(0, - 0, 0), PCT012H101 = c(0, 0, 0), PCT012H102 = c(0, 0, 0), - PCT012H103 = c(0, 0, 1), PCT012H104 = c(0, 0, 0), PCT012H105 = c(0, - 1, 0), PCT012H192 = c(3, 3, 15), PCT012H193 = c(7, 4, 10), - PCT012H194 = c(1, 0, 11), PCT012H195 = c(2, 5, 14), PCT012H196 = c(1, - 6, 8), PCT012H197 = c(1, 2, 9), PCT012H198 = c(0, 2, 2), - PCT012H199 = c(4, 0, 4), PCT012H200 = c(2, 2, 2), PCT012H201 = c(1, - 1, 3), PCT012H202 = c(1, 0, 1), PCT012H203 = c(2, 1, 0), - PCT012H204 = c(1, 0, 1), PCT012H205 = c(2, 0, 1), PCT012H206 = c(2, - 1, 0), PCT012H207 = c(0, 0, 2), PCT012H208 = c(0, 0, 0), - PCT012H209 = c(0, 0, 0), PCT012L088 = c(1, 0, 9), PCT012M088 = c(0, - 0, 0), PCT012L089 = c(1, 1, 7), PCT012M089 = c(1, 0, 0), - PCT012L090 = c(1, 0, 5), PCT012M090 = c(0, 0, 0), PCT012L091 = c(1, - 0, 6), PCT012M091 = c(0, 0, 0), PCT012L092 = c(0, 1, 2), - PCT012M092 = c(0, 0, 0), PCT012L093 = c(0, 0, 3), PCT012M093 = c(0, - 0, 0), PCT012L094 = c(0, 0, 3), PCT012M094 = c(0, 0, 0), - PCT012L095 = c(0, 0, 1), PCT012M095 = c(0, 0, 0), PCT012L096 = c(0, - 0, 1), PCT012M096 = c(0, 0, 0), PCT012L097 = c(0, 0, 1), - PCT012M097 = c(0, 0, 0), PCT012L098 = c(0, 0, 1), PCT012M098 = c(0, - 0, 0), PCT012L099 = c(1, 0, 1), PCT012M099 = c(0, 0, 0), - PCT012L100 = c(0, 0, 0), PCT012M100 = c(0, 0, 0), PCT012L101 = c(0, - 0, 0), PCT012M101 = c(0, 0, 0), PCT012L102 = c(0, 0, 0), - PCT012M102 = c(0, 0, 0), PCT012L103 = c(0, 0, 0), PCT012M103 = c(0, - 0, 0), PCT012L104 = c(0, 0, 0), PCT012M104 = c(0, 0, 0), - PCT012L105 = c(0, 0, 0), PCT012M105 = c(0, 0, 0), PCT012L192 = c(2, - 0, 16), PCT012M192 = c(0, 0, 0), PCT012L193 = c(5, 0, 10), - PCT012M193 = c(0, 0, 0), PCT012L194 = c(4, 2, 2), PCT012M194 = c(0, - 0, 0), PCT012L195 = c(0, 1, 11), PCT012M195 = c(0, 0, 0), - PCT012L196 = c(2, 0, 7), PCT012M196 = c(0, 1, 1), PCT012L197 = c(0, - 0, 2), PCT012M197 = c(0, 0, 
0), PCT012L198 = c(2, 0, 6), - PCT012M198 = c(0, 0, 0), PCT012L199 = c(1, 0, 1), PCT012M199 = c(0, - 0, 0), PCT012L200 = c(0, 0, 2), PCT012M200 = c(0, 0, 0), - PCT012L201 = c(0, 0, 0), PCT012M201 = c(0, 0, 0), PCT012L202 = c(0, - 0, 2), PCT012M202 = c(0, 0, 0), PCT012L203 = c(0, 0, 1), - PCT012M203 = c(0, 0, 0), PCT012L204 = c(0, 1, 0), PCT012M204 = c(0, - 0, 0), PCT012L205 = c(0, 0, 1), PCT012M205 = c(0, 0, 0), - PCT012L206 = c(0, 0, 0), PCT012M206 = c(0, 0, 0), PCT012L207 = c(0, - 0, 0), PCT012M207 = c(0, 0, 0), PCT012L208 = c(0, 0, 0), - PCT012M208 = c(0, 0, 0), PCT012L209 = c(0, 0, 0), PCT012M209 = c(0, - 0, 0), PCT012K088 = c(1, 0, 1), PCT012N088 = c(0, 0, 0), - PCT012O088 = c(0, 2, 4), PCT012K089 = c(0, 1, 0), PCT012N089 = c(0, - 0, 0), PCT012O089 = c(1, 0, 4), PCT012K090 = c(0, 0, 1), - PCT012N090 = c(0, 0, 0), PCT012O090 = c(2, 1, 0), PCT012K091 = c(1, - 0, 0), PCT012N091 = c(0, 0, 0), PCT012O091 = c(0, 1, 1), - PCT012K092 = c(0, 0, 0), PCT012N092 = c(0, 0, 0), PCT012O092 = c(0, - 0, 1), PCT012K093 = c(0, 0, 0), PCT012N093 = c(0, 0, 0), - PCT012O093 = c(1, 1, 0), PCT012K094 = c(0, 1, 0), PCT012N094 = c(0, - 0, 0), PCT012O094 = c(0, 1, 2), PCT012K095 = c(0, 0, 0), - PCT012N095 = c(0, 0, 0), PCT012O095 = c(0, 0, 1), PCT012K096 = c(0, - 1, 0), PCT012N096 = c(0, 0, 0), PCT012O096 = c(0, 0, 1), - PCT012K097 = c(1, 0, 0), PCT012N097 = c(0, 0, 0), PCT012O097 = c(0, - 0, 1), PCT012K098 = c(0, 0, 0), PCT012N098 = c(0, 0, 0), - PCT012O098 = c(1, 0, 0), PCT012K099 = c(0, 0, 0), PCT012N099 = c(0, - 0, 0), PCT012O099 = c(0, 0, 0), PCT012K100 = c(0, 0, 1), - PCT012N100 = c(0, 0, 0), PCT012O100 = c(0, 0, 0), PCT012K101 = c(0, - 0, 0), PCT012N101 = c(0, 0, 0), PCT012O101 = c(0, 0, 0), - PCT012K102 = c(0, 1, 0), PCT012N102 = c(0, 0, 0), PCT012O102 = c(0, - 0, 0), PCT012K103 = c(0, 0, 0), PCT012N103 = c(0, 0, 0), - PCT012O103 = c(0, 0, 0), PCT012K104 = c(0, 0, 0), PCT012N104 = c(0, - 0, 0), PCT012O104 = c(0, 0, 0), PCT012K105 = c(0, 0, 0), - PCT012N105 = c(0, 0, 0), 
PCT012O105 = c(0, 0, 0), PCT012K192 = c(3, - 1, 4), PCT012N192 = c(0, 0, 0), PCT012O192 = c(1, 2, 5), - PCT012K193 = c(1, 3, 1), PCT012N193 = c(0, 0, 0), PCT012O193 = c(1, - 2, 7), PCT012K194 = c(1, 1, 0), PCT012N194 = c(0, 0, 1), - PCT012O194 = c(0, 3, 6), PCT012K195 = c(0, 1, 0), PCT012N195 = c(0, - 0, 0), PCT012O195 = c(0, 1, 5), PCT012K196 = c(0, 2, 0), - PCT012N196 = c(0, 0, 0), PCT012O196 = c(0, 0, 7), PCT012K197 = c(0, - 0, 1), PCT012N197 = c(0, 0, 0), PCT012O197 = c(0, 2, 2), - PCT012K198 = c(0, 2, 3), PCT012N198 = c(1, 0, 0), PCT012O198 = c(0, - 0, 3), PCT012K199 = c(0, 1, 0), PCT012N199 = c(0, 1, 0), - PCT012O199 = c(0, 1, 0), PCT012K200 = c(0, 1, 0), PCT012N200 = c(0, - 0, 0), PCT012O200 = c(1, 1, 0), PCT012K201 = c(0, 0, 0), - PCT012N201 = c(0, 0, 0), PCT012O201 = c(1, 0, 0), PCT012K202 = c(0, - 0, 0), PCT012N202 = c(0, 0, 0), PCT012O202 = c(0, 1, 1), - PCT012K203 = c(0, 2, 1), PCT012N203 = c(0, 0, 0), PCT012O203 = c(0, - 1, 0), PCT012K204 = c(0, 0, 0), PCT012N204 = c(0, 0, 0), - PCT012O204 = c(0, 0, 0), PCT012K205 = c(0, 0, 0), PCT012N205 = c(0, - 0, 0), PCT012O205 = c(0, 0, 0), PCT012K206 = c(0, 0, 0), - PCT012N206 = c(0, 0, 0), PCT012O206 = c(0, 0, 1), PCT012K207 = c(0, - 0, 0), PCT012N207 = c(0, 0, 0), PCT012O207 = c(0, 0, 2), - PCT012K208 = c(0, 0, 0), PCT012N208 = c(0, 0, 0), PCT012O208 = c(0, - 0, 0), PCT012K209 = c(0, 0, 0), PCT012N209 = c(0, 0, 0), - PCT012O209 = c(0, 0, 0), r_whi = c(0.180469772578534, 0.253982943390052, - 0.565547284031414), r_bla = c(0.20243920720412, 0.131404525061301, - 0.666156267734578), r_his = c(0.127640977315251, 0.23154559484301, - 0.64081342784174), r_asi = c(0.117004133678974, 0.0690814825194423, - 0.813914383801583), r_oth = c(0.261609154774003, 0.205452215791101, - 0.532938629434896), r_1_whi = c(0.0103484947643979, 0.0105427846858639, - 0.0260587096422339), r_1_bla = c(0.0146427385936546, 0.0104292704864494, - 0.0473814393249885), r_1_his = c(0.0166072574807774, 0.0334740033596919, - 0.0732713292634627), r_1_asi 
= c(0.00662089259440902, 0.00378336719680516, - 0.0621102781475513), r_1_oth = c(0.0414439093359254, 0.0350815181372332, - 0.0870852295320991), r_2_whi = c(0.0107285531195462, 0.0110932727966841, - 0.028253844895288), r_2_bla = c(0.0154029831568352, 0.00988317932134788, - 0.0493570044222677), r_2_his = c(0.0140533453517434, 0.0259078679613772, - 0.0676718427773453), r_2_asi = c(0.00802213970433686, 0.00521964548448119, - 0.0575562250402859), r_2_oth = c(0.0378208810144479, 0.0298678920160827, - 0.0763486943843061), r_3_whi = c(0.0107984293193717, 0.0116386480148342, - 0.0309943553664921), r_3_bla = c(0.0160615048559283, 0.00978681029221231, - 0.0528851816556199), r_3_his = c(0.0112126302563472, 0.0185875636770872, - 0.058330260444408), r_3_asi = c(0.00693617319414279, 0.00483430252925103, - 0.0459258740278848), r_3_oth = c(0.033756019970839, 0.0235496840896037, - 0.0615914814651173), r_4_whi = c(0.00678651287085515, 0.00793350512652705, - 0.0207890215968586), r_4_bla = c(0.00990459466115579, 0.00679401655405767, - 0.0347892195179407), r_4_his = c(0.00701984403381544, 0.0101473620955737, - 0.0336242334849292), r_4_asi = c(0.00399355426329433, 0.00304771246409304, - 0.0267287886218735), r_4_oth = c(0.016082711085583, 0.013254981663942, - 0.0329607210710025), r_5_whi = c(0.00460160340314136, 0.00465614092495637, - 0.0187779504799302), r_5_bla = c(0.0113394224282854, 0.00433660631110064, - 0.0224914606332516), r_5_his = c(0.00520342524685541, 0.00641892353286625, - 0.0257986096884773), r_5_asi = c(0.00266236950886289, 0.00157640299866882, - 0.0248721362012191), r_5_oth = c(0.011973666769761, 0.00600892502098705, - 0.0212521539345204), r_6_whi = c(0.00225137707242583, 0.00229909740401396, - 0.00987981293630017), r_6_bla = c(0.005375250291784, 0.00189525757299954, - 0.0105845317000567), r_6_his = c(0.00243099657202169, 0.00357820843747012, - 0.0131792791685445), r_6_asi = c(0.00147130946542423, 0.000735654732712114, - 0.0136971904995446), r_6_oth = c(0.00428577740467459, 
0.00207661379401758, - 0.0104272522423011), r_7_whi = c(0.00251043030104712, 0.00198721095113438, - 0.00958496945898778), r_7_bla = c(0.00455611354413166, 0.00177211936910409, - 0.00946558019509375), r_7_his = c(0.0025675694131465, 0.00422010079075675, - 0.012482757678808), r_7_asi = c(0.00112099768794227, 0.000560498843971134, - 0.0114201639459119), r_7_oth = c(0.00477179339901913, 0.00273936287721469, - 0.00901338753148058), r_8_whi = c(0.007042157504363, 0.00668255071989529, - 0.022169502617801), r_8_bla = c(0.00990459466115579, 0.00549303466072748, - 0.0279577261192192), r_8_his = c(0.00767539367121454, 0.0150912989442919, - 0.034867046339165), r_8_asi = c(0.0050795207734884, 0.00238212008687732, - 0.0347859595039585), r_8_oth = c(0.0112225511421376, 0.00600892502098705, - 0.0211637873900941), r_9_whi = c(0.0113455088350785, 0.0116420566099476, - 0.034428514943281), r_9_bla = c(0.0128920345643585, 0.00869462796200919, - 0.0447634140334722), r_9_his = c(0.0117042924843966, 0.0273691973614127, - 0.0615533794949536), r_9_asi = c(0.00760176557135851, 0.00413367897428712, - 0.0875429131927415), r_9_oth = c(0.0156850616356647, 0.0100296027923828, - 0.0325630716210843), r_10_whi = c(0.010547897578534, 0.0108768270069808, - 0.0317902623254799), r_10_bla = c(0.0112751764088617, 0.0076399224764699, - 0.0449561520917433), r_10_his = c(0.00971032900397427, 0.0233129839800057, - 0.0571830485789596), r_10_asi = c(0.00826735794857423, 0.00490436488474743, - 0.0874027884817488), r_10_oth = c(0.0124154994918924, 0.00865992135377546, - 0.0312375734546901), r_11_whi = c(0.0108648969240838, 0.0123408186082024, - 0.0336939626963351), r_11_bla = c(0.0121478515060338, 0.00742576907839085, - 0.0470173785482541), r_11_his = c(0.0085904317067508, 0.0194479725761735, - 0.0498490870105571), r_11_asi = c(0.00991382330273944, 0.00592026903944511, - 0.0794507111329083), r_11_oth = c(0.0118411169531215, 0.00790880572615208, - 0.028056377855344), r_12_whi = c(0.0123732002617801, 
0.0148563618019197, - 0.0389994409904014), r_12_bla = c(0.0130419419430138, 0.00933173432129434, - 0.0522213061215749), r_12_his = c(0.0068969284768031, 0.0138348288059437, - 0.0439491402739651), r_12_asi = c(0.0104743221467106, 0.0059903313949415, - 0.0613746234148392), r_12_oth = c(0.0108690849644325, 0.00773207263729952, - 0.0258913975169001), r_13_whi = c(0.0142956479057592, 0.0187694289921466, - 0.0460910231239093), r_13_bla = c(0.014107355098457, 0.00997954835048345, - 0.0530243813643713), r_13_his = c(0.00647355266931618, 0.00906843665068764, - 0.0338700645989539), r_13_asi = c(0.0100539480137322, 0.00592026903944511, - 0.0544034190429482), r_13_oth = c(0.0107807184200062, 0.0114876507754164, - 0.0254937480669818), r_14_whi = c(0.0135491655759162, 0.0200459478621291, - 0.0477476003490401), r_14_bla = c(0.0126243428167596, 0.00984034864173207, - 0.0476223618978274), r_14_his = c(0.00498490870105571, 0.00643258081697874, - 0.0242143647314295), r_14_asi = c(0.00802213970433686, 0.00518461430673299, - 0.0443845022069642), r_14_oth = c(0.0109132682366456, 0.00888083771484116, - 0.021119604117881), r_15_whi = c(0.0115568417321117, 0.0209338868891798, - 0.0415763388961606), r_15_bla = c(0.0109593001466951, 0.00799327558330032, - 0.0368076152948357), r_15_his = c(0.0033050627552205, 0.00465713388235616, - 0.0178227557667882), r_15_asi = c(0.00655083023891263, 0.00364324248581237, - 0.0364674560358719), r_15_oth = c(0.00755533954844696, 0.0078204391817258, - 0.0165245438077144), r_16_whi = c(0.00452491001308901, 0.00917082515270506, - 0.0147200179973822), r_16_bla = c(0.00390829951494255, 0.00267691747598805, - 0.0132025569915731), r_16_his = c(0.00140670026358558, 0.00151595853648543, - 0.00549022821321752), r_16_asi = c(0.00266236950886289, 0.00178659006515799, - 0.0121207875008758), r_16_oth = c(0.00269517960500155, 0.00269517960500155, - 0.00477179339901913), r_17_whi = c(0.00687343204624782, 0.0147830770069808, - 0.020799247382199), r_17_bla = 
c(0.00532171194226424, 0.00354959257316015, - 0.0164898116520864), r_17_his = c(0.00177544693462258, 0.00189836249163491, - 0.0068969284768031), r_17_asi = c(0.00360821130806418, 0.00203180830939536, - 0.0178308694738317), r_17_oth = c(0.00393231122696947, 0.00432996067688773, - 0.00645075774311846), r_18_whi = c(0.00403918520942408, 0.0088964332460733, - 0.011166557591623), r_18_bla = c(0.00278934800997955, 0.00196485742737523, - 0.0087802893212408), r_18_his = c(0.000655549637399107, 0.0010242963084361, - 0.00342797831223283), r_18_asi = c(0.0025923071533665, 0.00122609122118686, - 0.00910810621453093), r_18_oth = c(0.00203243052180444, 0.00189988070516502, - 0.00304864578270667), r_19_whi = c(0.00547590804973822, 0.0126595222513089, - 0.0152568717277487), r_19_bla = c(0.00387082267027872, 0.0028000556798835, - 0.0113608377680933), r_19_his = c(0.00127012742246077, 0.00135207112713566, - 0.00435667363188156), r_19_asi = c(0.00318783717508583, 0.00164646535416521, - 0.013241785188818), r_19_oth = c(0.00287191269385411, 0.00371139486590377, - 0.00446251049352715), r_20_whi = c(0.00717168411867365, 0.016080047447644, - 0.0191648260253054), r_20_bla = c(0.00503260485485753, 0.00315340878671392, - 0.014123416603313), r_20_his = c(0.00162521680938529, 0.00187104792340995, - 0.00529902623564278), r_20_asi = c(0.00353814895256779, 0.00217193302038815, - 0.0167449029636376), r_20_oth = c(0.00384394468254319, 0.00508107630451111, - 0.00450669376574029), r_21_whi = c(0.00569405813699825, 0.011773287521815, - 0.0165981539048866), r_21_bla = c(0.00362454626248782, 0.00248953325266889, - 0.00955124155432536), r_21_his = c(0.000969667171986179, - 0.00107892544488603, 0.00349626473279524), r_21_asi = c(0.00248721362012191, - 0.00140124710992784, 0.00879282561479717), r_21_oth = c(0.00238589669950957, - 0.00326956214377237, 0.00331374541598551), r_22_whi = c(0.00372900305410122, - 0.0079471395069808, 0.0135270097076789), r_22_bla = c(0.00182565771862385, - 0.00177211936910409, 
0.00606054116563695), r_22_his = c(0.000833094330861365, - 0.000696521489736551, 0.0023490528673468), r_22_asi = c(0.00136621593217964, - 0.000735654732712114, 0.00437889721852449), r_22_oth = c(0.00163478107188618, - 0.00163478107188618, 0.00265099633278841), r_23_whi = c(0.00336087478184991, - 0.00637407286212914, 0.0134792893760908), r_23_bla = c(0.00183101155357583, - 0.0017025195147284, 0.00526281975779251), r_23_his = c(0.000669206921511588, - 0.000559948648611737, 0.00183007607107251), r_23_asi = c(0.00077068591046031, - 0.000245218244237371, 0.00357318013031598), r_23_oth = c(0.000795298899836522, - 0.00172314761631246, 0.00300446251049353)), class = c("tbl_df", - "tbl", "data.frame"), row.names = c(NA, -3L)) - diff --git a/tests/testthat/data/census_test_nj_block_2020.rds b/tests/testthat/data/census_test_nj_block_2020.rds index 3331f2c..3c78fed 100644 Binary files a/tests/testthat/data/census_test_nj_block_2020.rds and b/tests/testthat/data/census_test_nj_block_2020.rds differ diff --git a/tests/testthat/data/new_census_table_NJ_2020.rds b/tests/testthat/data/new_census_table_NJ_2020.rds deleted file mode 100644 index 570dd53..0000000 Binary files a/tests/testthat/data/new_census_table_NJ_2020.rds and /dev/null differ diff --git a/tests/testthat/test-census_geo_api.R b/tests/testthat/test-census_geo_api.R deleted file mode 100644 index 8588bf6..0000000 --- a/tests/testthat/test-census_geo_api.R +++ /dev/null @@ -1,72 +0,0 @@ -skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY"))) - -test_that("snapshot", { - # TODO: Test that sub-geographies sum to match pooled geographies (e.g. blocks sum to block groups, sum to tracts, sum to counties) - - # These snapshots were generated using the calculations in v2.0.0 - # and verified that the calculations resulted in the same numbers for PR #120. 
- expect_snapshot_value( - census_geo_api(state = "DE", geo = "county", year = "2020"), - style = "deparse" - ) - expect_snapshot_value( - census_geo_api(state = "DE", geo = "county", year = "2010"), - style = "deparse" - ) - expect_snapshot_value( - census_geo_api(state = "DE", geo = "county", year = "2010", sex = TRUE), - style = "deparse" - ) - expect_snapshot_value( - census_geo_api(state = "DE", geo = "county", year = "2010", age = TRUE), - style = "deparse" - ) -}) - -expect_subset_sums_equal_overall_total <- function(data) { - `%>%` <- dplyr::`%>%` - - sums <- data %>% - dplyr::select(-dplyr::starts_with("r_")) %>% - tidyr::pivot_longer(dplyr::starts_with("P")) %>% - dplyr::mutate( - subset = dplyr::case_when( - grepl("001", name) ~ "overall", - .default = "subset" - ), - name = sub("_.+", "", name) - ) %>% - dplyr::summarise( - value = sum(value), - .by = -value - ) %>% - dplyr::summarize( - are_equal = length(unique(value)) <= 1, - .by = c(-subset, -value) - ) - - expect_true(all(sums$are_equal)) -} - -test_that("sums", { - expect_subset_sums_equal_overall_total( - census_geo_api(state = "DE", geo = "county", year = "2020", sex = TRUE) - ) - expect_subset_sums_equal_overall_total( - census_geo_api(state = "DE", geo = "county", year = "2020", age = TRUE) - ) - expect_subset_sums_equal_overall_total( - census_geo_api(state = "DE", geo = "county", year = "2020", age = TRUE, sex = TRUE) - ) - expect_subset_sums_equal_overall_total( - census_geo_api(state = "DE", geo = "county", year = "2010", sex = TRUE) - ) - expect_subset_sums_equal_overall_total( - census_geo_api(state = "DE", geo = "county", year = "2010", age = TRUE) - ) - expect_subset_sums_equal_overall_total( - census_geo_api(state = "DE", geo = "county", year = "2010", age = TRUE, sex = TRUE) - ) -}) - -# TODO: Test that all variables sum to total population of geography diff --git a/tests/testthat/test-census_geo_api_names.R b/tests/testthat/test-census_geo_api_names.R deleted file mode 100644 index 
85d6c0e..0000000 --- a/tests/testthat/test-census_geo_api_names.R +++ /dev/null @@ -1,412 +0,0 @@ -test_that("census_geo_api_names() for 2020", { - # TODO: Verify against table names here: https://api.census.gov/data/2020/dec/dhc/variables.html - - expect_equal( - census_geo_api_names("2020"), - list( - r_whi = "P12I_001N", - r_bla = "P12J_001N", - r_his = "P12H_001N", - r_asi = c("P12L_001N", "P12M_001N"), - r_oth = c("P12K_001N", "P12N_001N", "P12O_001N") - ) - ) - - expect_equal( - census_geo_api_names("2020", age = FALSE, sex = TRUE), - c( - census_geo_api_names("2020", age = FALSE, sex = FALSE), - list( - r_mal_whi = "P12I_002N", - r_mal_bla = "P12J_002N", - r_mal_his = "P12H_002N", - r_mal_asi = c("P12L_002N", "P12M_002N"), - r_mal_oth = c("P12K_002N", "P12N_002N", "P12O_002N"), - r_fem_whi = "P12I_026N", - r_fem_bla = "P12J_026N", - r_fem_his = "P12H_026N", - r_fem_asi = c("P12L_026N", "P12M_026N"), - r_fem_oth = c("P12K_026N", "P12N_026N", "P12O_026N") - ) - ) - ) - - expect_equal( - census_geo_api_names("2020", age = TRUE, sex = FALSE), - c( - census_geo_api_names("2020", age = FALSE, sex = FALSE), - list( - r_1_whi = c("P12I_003N", "P12I_027N"), - r_1_bla = c("P12J_003N", "P12J_027N"), - r_1_his = c("P12H_003N", "P12H_027N"), - r_1_asi = c("P12L_003N", "P12M_003N", "P12L_027N", "P12M_027N"), - r_1_oth = c("P12K_003N", "P12N_003N", "P12O_003N", "P12K_027N", "P12N_027N", "P12O_027N"), - r_2_whi = c("P12I_004N", "P12I_028N"), - r_2_bla = c("P12J_004N", "P12J_028N"), - r_2_his = c("P12H_004N", "P12H_028N"), - r_2_asi = c("P12L_004N", "P12M_004N", "P12L_028N", "P12M_028N"), - r_2_oth = c("P12K_004N", "P12N_004N", "P12O_004N", "P12K_028N", "P12N_028N", "P12O_028N"), - r_3_whi = c("P12I_005N", "P12I_029N"), - r_3_bla = c("P12J_005N", "P12J_029N"), - r_3_his = c("P12H_005N", "P12H_029N"), - r_3_asi = c("P12L_005N", "P12M_005N", "P12L_029N", "P12M_029N"), - r_3_oth = c("P12K_005N", "P12N_005N", "P12O_005N", "P12K_029N", "P12N_029N", "P12O_029N"), - r_4_whi = 
c("P12I_006N", "P12I_030N"), - r_4_bla = c("P12J_006N", "P12J_030N"), - r_4_his = c("P12H_006N", "P12H_030N"), - r_4_asi = c("P12L_006N", "P12M_006N", "P12L_030N", "P12M_030N"), - r_4_oth = c("P12K_006N", "P12N_006N", "P12O_006N", "P12K_030N", "P12N_030N", "P12O_030N"), - r_5_whi = c("P12I_007N", "P12I_031N"), - r_5_bla = c("P12J_007N", "P12J_031N"), - r_5_his = c("P12H_007N", "P12H_031N"), - r_5_asi = c("P12L_007N", "P12M_007N", "P12L_031N", "P12M_031N"), - r_5_oth = c("P12K_007N", "P12N_007N", "P12O_007N", "P12K_031N", "P12N_031N", "P12O_031N"), - r_6_whi = c("P12I_008N", "P12I_032N"), - r_6_bla = c("P12J_008N", "P12J_032N"), - r_6_his = c("P12H_008N", "P12H_032N"), - r_6_asi = c("P12L_008N", "P12M_008N", "P12L_032N", "P12M_032N"), - r_6_oth = c("P12K_008N", "P12N_008N", "P12O_008N", "P12K_032N", "P12N_032N", "P12O_032N"), - r_7_whi = c("P12I_009N", "P12I_033N"), - r_7_bla = c("P12J_009N", "P12J_033N"), - r_7_his = c("P12H_009N", "P12H_033N"), - r_7_asi = c("P12L_009N", "P12M_009N", "P12L_033N", "P12M_033N"), - r_7_oth = c("P12K_009N", "P12N_009N", "P12O_009N", "P12K_033N", "P12N_033N", "P12O_033N"), - r_8_whi = c("P12I_010N", "P12I_034N"), - r_8_bla = c("P12J_010N", "P12J_034N"), - r_8_his = c("P12H_010N", "P12H_034N"), - r_8_asi = c("P12L_010N", "P12M_010N", "P12L_034N", "P12M_034N"), - r_8_oth = c("P12K_010N", "P12N_010N", "P12O_010N", "P12K_034N", "P12N_034N", "P12O_034N"), - r_9_whi = c("P12I_011N", "P12I_035N"), - r_9_bla = c("P12J_011N", "P12J_035N"), - r_9_his = c("P12H_011N", "P12H_035N"), - r_9_asi = c("P12L_011N", "P12M_011N", "P12L_035N", "P12M_035N"), - r_9_oth = c("P12K_011N", "P12N_011N", "P12O_011N", "P12K_035N", "P12N_035N", "P12O_035N"), - r_10_whi = c("P12I_012N", "P12I_036N"), - r_10_bla = c("P12J_012N", "P12J_036N"), - r_10_his = c("P12H_012N", "P12H_036N"), - r_10_asi = c("P12L_012N", "P12M_012N", "P12L_036N", "P12M_036N"), - r_10_oth = c("P12K_012N", "P12N_012N", "P12O_012N", "P12K_036N", "P12N_036N", "P12O_036N"), - r_11_whi = 
c("P12I_013N", "P12I_037N"), - r_11_bla = c("P12J_013N", "P12J_037N"), - r_11_his = c("P12H_013N", "P12H_037N"), - r_11_asi = c("P12L_013N", "P12M_013N", "P12L_037N", "P12M_037N"), - r_11_oth = c("P12K_013N", "P12N_013N", "P12O_013N", "P12K_037N", "P12N_037N", "P12O_037N"), - r_12_whi = c("P12I_014N", "P12I_038N"), - r_12_bla = c("P12J_014N", "P12J_038N"), - r_12_his = c("P12H_014N", "P12H_038N"), - r_12_asi = c("P12L_014N", "P12M_014N", "P12L_038N", "P12M_038N"), - r_12_oth = c("P12K_014N", "P12N_014N", "P12O_014N", "P12K_038N", "P12N_038N", "P12O_038N"), - r_13_whi = c("P12I_015N", "P12I_039N"), - r_13_bla = c("P12J_015N", "P12J_039N"), - r_13_his = c("P12H_015N", "P12H_039N"), - r_13_asi = c("P12L_015N", "P12M_015N", "P12L_039N", "P12M_039N"), - r_13_oth = c("P12K_015N", "P12N_015N", "P12O_015N", "P12K_039N", "P12N_039N", "P12O_039N"), - r_14_whi = c("P12I_016N", "P12I_040N"), - r_14_bla = c("P12J_016N", "P12J_040N"), - r_14_his = c("P12H_016N", "P12H_040N"), - r_14_asi = c("P12L_016N", "P12M_016N", "P12L_040N", "P12M_040N"), - r_14_oth = c("P12K_016N", "P12N_016N", "P12O_016N", "P12K_040N", "P12N_040N", "P12O_040N"), - r_15_whi = c("P12I_017N", "P12I_041N"), - r_15_bla = c("P12J_017N", "P12J_041N"), - r_15_his = c("P12H_017N", "P12H_041N"), - r_15_asi = c("P12L_017N", "P12M_017N", "P12L_041N", "P12M_041N"), - r_15_oth = c("P12K_017N", "P12N_017N", "P12O_017N", "P12K_041N", "P12N_041N", "P12O_041N"), - r_16_whi = c("P12I_018N", "P12I_042N"), - r_16_bla = c("P12J_018N", "P12J_042N"), - r_16_his = c("P12H_018N", "P12H_042N"), - r_16_asi = c("P12L_018N", "P12M_018N", "P12L_042N", "P12M_042N"), - r_16_oth = c("P12K_018N", "P12N_018N", "P12O_018N", "P12K_042N", "P12N_042N", "P12O_042N"), - r_17_whi = c("P12I_019N", "P12I_043N"), - r_17_bla = c("P12J_019N", "P12J_043N"), - r_17_his = c("P12H_019N", "P12H_043N"), - r_17_asi = c("P12L_019N", "P12M_019N", "P12L_043N", "P12M_043N"), - r_17_oth = c("P12K_019N", "P12N_019N", "P12O_019N", "P12K_043N", "P12N_043N", 
"P12O_043N"), - r_18_whi = c("P12I_020N", "P12I_044N"), - r_18_bla = c("P12J_020N", "P12J_044N"), - r_18_his = c("P12H_020N", "P12H_044N"), - r_18_asi = c("P12L_020N", "P12M_020N", "P12L_044N", "P12M_044N"), - r_18_oth = c("P12K_020N", "P12N_020N", "P12O_020N", "P12K_044N", "P12N_044N", "P12O_044N"), - r_19_whi = c("P12I_021N", "P12I_045N"), - r_19_bla = c("P12J_021N", "P12J_045N"), - r_19_his = c("P12H_021N", "P12H_045N"), - r_19_asi = c("P12L_021N", "P12M_021N", "P12L_045N", "P12M_045N"), - r_19_oth = c("P12K_021N", "P12N_021N", "P12O_021N", "P12K_045N", "P12N_045N", "P12O_045N"), - r_20_whi = c("P12I_022N", "P12I_046N"), - r_20_bla = c("P12J_022N", "P12J_046N"), - r_20_his = c("P12H_022N", "P12H_046N"), - r_20_asi = c("P12L_022N", "P12M_022N", "P12L_046N", "P12M_046N"), - r_20_oth = c("P12K_022N", "P12N_022N", "P12O_022N", "P12K_046N", "P12N_046N", "P12O_046N"), - r_21_whi = c("P12I_023N", "P12I_047N"), - r_21_bla = c("P12J_023N", "P12J_047N"), - r_21_his = c("P12H_023N", "P12H_047N"), - r_21_asi = c("P12L_023N", "P12M_023N", "P12L_047N", "P12M_047N"), - r_21_oth = c("P12K_023N", "P12N_023N", "P12O_023N", "P12K_047N", "P12N_047N", "P12O_047N"), - r_22_whi = c("P12I_024N", "P12I_048N"), - r_22_bla = c("P12J_024N", "P12J_048N"), - r_22_his = c("P12H_024N", "P12H_048N"), - r_22_asi = c("P12L_024N", "P12M_024N", "P12L_048N", "P12M_048N"), - r_22_oth = c("P12K_024N", "P12N_024N", "P12O_024N", "P12K_048N", "P12N_048N", "P12O_048N"), - r_23_whi = c("P12I_025N", "P12I_049N"), - r_23_bla = c("P12J_025N", "P12J_049N"), - r_23_his = c("P12H_025N", "P12H_049N"), - r_23_asi = c("P12L_025N", "P12M_025N", "P12L_049N", "P12M_049N"), - r_23_oth = c("P12K_025N", "P12N_025N", "P12O_025N", "P12K_049N", "P12N_049N", "P12O_049N") - ) - ) - ) - - expect_equal( - census_geo_api_names("2020", age = TRUE, sex = TRUE), - c( - census_geo_api_names("2020", age = FALSE, sex = FALSE), - list( - r_mal_1_whi = "P12I_003N", - r_mal_1_bla = "P12J_003N", - r_mal_1_his = "P12H_003N", - r_mal_1_asi 
= c("P12L_003N", "P12M_003N"), - r_mal_1_oth = c("P12K_003N", "P12N_003N", "P12O_003N"), - r_mal_2_whi = "P12I_004N", - r_mal_2_bla = "P12J_004N", - r_mal_2_his = "P12H_004N", - r_mal_2_asi = c("P12L_004N", "P12M_004N"), - r_mal_2_oth = c("P12K_004N", "P12N_004N", "P12O_004N"), - r_mal_3_whi = "P12I_005N", - r_mal_3_bla = "P12J_005N", - r_mal_3_his = "P12H_005N", - r_mal_3_asi = c("P12L_005N", "P12M_005N"), - r_mal_3_oth = c("P12K_005N", "P12N_005N", "P12O_005N"), - r_mal_4_whi = "P12I_006N", - r_mal_4_bla = "P12J_006N", - r_mal_4_his = "P12H_006N", - r_mal_4_asi = c("P12L_006N", "P12M_006N"), - r_mal_4_oth = c("P12K_006N", "P12N_006N", "P12O_006N"), - r_mal_5_whi = "P12I_007N", - r_mal_5_bla = "P12J_007N", - r_mal_5_his = "P12H_007N", - r_mal_5_asi = c("P12L_007N", "P12M_007N"), - r_mal_5_oth = c("P12K_007N", "P12N_007N", "P12O_007N"), - r_mal_6_whi = "P12I_008N", - r_mal_6_bla = "P12J_008N", - r_mal_6_his = "P12H_008N", - r_mal_6_asi = c("P12L_008N", "P12M_008N"), - r_mal_6_oth = c("P12K_008N", "P12N_008N", "P12O_008N"), - r_mal_7_whi = "P12I_009N", - r_mal_7_bla = "P12J_009N", - r_mal_7_his = "P12H_009N", - r_mal_7_asi = c("P12L_009N", "P12M_009N"), - r_mal_7_oth = c("P12K_009N", "P12N_009N", "P12O_009N"), - r_mal_8_whi = "P12I_010N", - r_mal_8_bla = "P12J_010N", - r_mal_8_his = "P12H_010N", - r_mal_8_asi = c("P12L_010N", "P12M_010N"), - r_mal_8_oth = c("P12K_010N", "P12N_010N", "P12O_010N"), - r_mal_9_whi = "P12I_011N", - r_mal_9_bla = "P12J_011N", - r_mal_9_his = "P12H_011N", - r_mal_9_asi = c("P12L_011N", "P12M_011N"), - r_mal_9_oth = c("P12K_011N", "P12N_011N", "P12O_011N"), - r_mal_10_whi = "P12I_012N", - r_mal_10_bla = "P12J_012N", - r_mal_10_his = "P12H_012N", - r_mal_10_asi = c("P12L_012N", "P12M_012N"), - r_mal_10_oth = c("P12K_012N", "P12N_012N", "P12O_012N"), - r_mal_11_whi = "P12I_013N", - r_mal_11_bla = "P12J_013N", - r_mal_11_his = "P12H_013N", - r_mal_11_asi = c("P12L_013N", "P12M_013N"), - r_mal_11_oth = c("P12K_013N", "P12N_013N", "P12O_013N"), 
- r_mal_12_whi = "P12I_014N", - r_mal_12_bla = "P12J_014N", - r_mal_12_his = "P12H_014N", - r_mal_12_asi = c("P12L_014N", "P12M_014N"), - r_mal_12_oth = c("P12K_014N", "P12N_014N", "P12O_014N"), - r_mal_13_whi = "P12I_015N", - r_mal_13_bla = "P12J_015N", - r_mal_13_his = "P12H_015N", - r_mal_13_asi = c("P12L_015N", "P12M_015N"), - r_mal_13_oth = c("P12K_015N", "P12N_015N", "P12O_015N"), - r_mal_14_whi = "P12I_016N", - r_mal_14_bla = "P12J_016N", - r_mal_14_his = "P12H_016N", - r_mal_14_asi = c("P12L_016N", "P12M_016N"), - r_mal_14_oth = c("P12K_016N", "P12N_016N", "P12O_016N"), - r_mal_15_whi = "P12I_017N", - r_mal_15_bla = "P12J_017N", - r_mal_15_his = "P12H_017N", - r_mal_15_asi = c("P12L_017N", "P12M_017N"), - r_mal_15_oth = c("P12K_017N", "P12N_017N", "P12O_017N"), - r_mal_16_whi = "P12I_018N", - r_mal_16_bla = "P12J_018N", - r_mal_16_his = "P12H_018N", - r_mal_16_asi = c("P12L_018N", "P12M_018N"), - r_mal_16_oth = c("P12K_018N", "P12N_018N", "P12O_018N"), - r_mal_17_whi = "P12I_019N", - r_mal_17_bla = "P12J_019N", - r_mal_17_his = "P12H_019N", - r_mal_17_asi = c("P12L_019N", "P12M_019N"), - r_mal_17_oth = c("P12K_019N", "P12N_019N", "P12O_019N"), - r_mal_18_whi = "P12I_020N", - r_mal_18_bla = "P12J_020N", - r_mal_18_his = "P12H_020N", - r_mal_18_asi = c("P12L_020N", "P12M_020N"), - r_mal_18_oth = c("P12K_020N", "P12N_020N", "P12O_020N"), - r_mal_19_whi = "P12I_021N", - r_mal_19_bla = "P12J_021N", - r_mal_19_his = "P12H_021N", - r_mal_19_asi = c("P12L_021N", "P12M_021N"), - r_mal_19_oth = c("P12K_021N", "P12N_021N", "P12O_021N"), - r_mal_20_whi = "P12I_022N", - r_mal_20_bla = "P12J_022N", - r_mal_20_his = "P12H_022N", - r_mal_20_asi = c("P12L_022N", "P12M_022N"), - r_mal_20_oth = c("P12K_022N", "P12N_022N", "P12O_022N"), - r_mal_21_whi = "P12I_023N", - r_mal_21_bla = "P12J_023N", - r_mal_21_his = "P12H_023N", - r_mal_21_asi = c("P12L_023N", "P12M_023N"), - r_mal_21_oth = c("P12K_023N", "P12N_023N", "P12O_023N"), - r_mal_22_whi = "P12I_024N", - r_mal_22_bla = 
"P12J_024N", - r_mal_22_his = "P12H_024N", - r_mal_22_asi = c("P12L_024N", "P12M_024N"), - r_mal_22_oth = c("P12K_024N", "P12N_024N", "P12O_024N"), - r_mal_23_whi = "P12I_025N", - r_mal_23_bla = "P12J_025N", - r_mal_23_his = "P12H_025N", - r_mal_23_asi = c("P12L_025N", "P12M_025N"), - r_mal_23_oth = c("P12K_025N", "P12N_025N", "P12O_025N"), - r_fem_1_whi = "P12I_027N", - r_fem_1_bla = "P12J_027N", - r_fem_1_his = "P12H_027N", - r_fem_1_asi = c("P12L_027N", "P12M_027N"), - r_fem_1_oth = c("P12K_027N", "P12N_027N", "P12O_027N"), - r_fem_2_whi = "P12I_028N", - r_fem_2_bla = "P12J_028N", - r_fem_2_his = "P12H_028N", - r_fem_2_asi = c("P12L_028N", "P12M_028N"), - r_fem_2_oth = c("P12K_028N", "P12N_028N", "P12O_028N"), - r_fem_3_whi = "P12I_029N", - r_fem_3_bla = "P12J_029N", - r_fem_3_his = "P12H_029N", - r_fem_3_asi = c("P12L_029N", "P12M_029N"), - r_fem_3_oth = c("P12K_029N", "P12N_029N", "P12O_029N"), - r_fem_4_whi = "P12I_030N", - r_fem_4_bla = "P12J_030N", - r_fem_4_his = "P12H_030N", - r_fem_4_asi = c("P12L_030N", "P12M_030N"), - r_fem_4_oth = c("P12K_030N", "P12N_030N", "P12O_030N"), - r_fem_5_whi = "P12I_031N", - r_fem_5_bla = "P12J_031N", - r_fem_5_his = "P12H_031N", - r_fem_5_asi = c("P12L_031N", "P12M_031N"), - r_fem_5_oth = c("P12K_031N", "P12N_031N", "P12O_031N"), - r_fem_6_whi = "P12I_032N", - r_fem_6_bla = "P12J_032N", - r_fem_6_his = "P12H_032N", - r_fem_6_asi = c("P12L_032N", "P12M_032N"), - r_fem_6_oth = c("P12K_032N", "P12N_032N", "P12O_032N"), - r_fem_7_whi = "P12I_033N", - r_fem_7_bla = "P12J_033N", - r_fem_7_his = "P12H_033N", - r_fem_7_asi = c("P12L_033N", "P12M_033N"), - r_fem_7_oth = c("P12K_033N", "P12N_033N", "P12O_033N"), - r_fem_8_whi = "P12I_034N", - r_fem_8_bla = "P12J_034N", - r_fem_8_his = "P12H_034N", - r_fem_8_asi = c("P12L_034N", "P12M_034N"), - r_fem_8_oth = c("P12K_034N", "P12N_034N", "P12O_034N"), - r_fem_9_whi = "P12I_035N", - r_fem_9_bla = "P12J_035N", - r_fem_9_his = "P12H_035N", - r_fem_9_asi = c("P12L_035N", "P12M_035N"), - 
r_fem_9_oth = c("P12K_035N", "P12N_035N", "P12O_035N"), - r_fem_10_whi = "P12I_036N", - r_fem_10_bla = "P12J_036N", - r_fem_10_his = "P12H_036N", - r_fem_10_asi = c("P12L_036N", "P12M_036N"), - r_fem_10_oth = c("P12K_036N", "P12N_036N", "P12O_036N"), - r_fem_11_whi = "P12I_037N", - r_fem_11_bla = "P12J_037N", - r_fem_11_his = "P12H_037N", - r_fem_11_asi = c("P12L_037N", "P12M_037N"), - r_fem_11_oth = c("P12K_037N", "P12N_037N", "P12O_037N"), - r_fem_12_whi = "P12I_038N", - r_fem_12_bla = "P12J_038N", - r_fem_12_his = "P12H_038N", - r_fem_12_asi = c("P12L_038N", "P12M_038N"), - r_fem_12_oth = c("P12K_038N", "P12N_038N", "P12O_038N"), - r_fem_13_whi = "P12I_039N", - r_fem_13_bla = "P12J_039N", - r_fem_13_his = "P12H_039N", - r_fem_13_asi = c("P12L_039N", "P12M_039N"), - r_fem_13_oth = c("P12K_039N", "P12N_039N", "P12O_039N"), - r_fem_14_whi = "P12I_040N", - r_fem_14_bla = "P12J_040N", - r_fem_14_his = "P12H_040N", - r_fem_14_asi = c("P12L_040N", "P12M_040N"), - r_fem_14_oth = c("P12K_040N", "P12N_040N", "P12O_040N"), - r_fem_15_whi = "P12I_041N", - r_fem_15_bla = "P12J_041N", - r_fem_15_his = "P12H_041N", - r_fem_15_asi = c("P12L_041N", "P12M_041N"), - r_fem_15_oth = c("P12K_041N", "P12N_041N", "P12O_041N"), - r_fem_16_whi = "P12I_042N", - r_fem_16_bla = "P12J_042N", - r_fem_16_his = "P12H_042N", - r_fem_16_asi = c("P12L_042N", "P12M_042N"), - r_fem_16_oth = c("P12K_042N", "P12N_042N", "P12O_042N"), - r_fem_17_whi = "P12I_043N", - r_fem_17_bla = "P12J_043N", - r_fem_17_his = "P12H_043N", - r_fem_17_asi = c("P12L_043N", "P12M_043N"), - r_fem_17_oth = c("P12K_043N", "P12N_043N", "P12O_043N"), - r_fem_18_whi = "P12I_044N", - r_fem_18_bla = "P12J_044N", - r_fem_18_his = "P12H_044N", - r_fem_18_asi = c("P12L_044N", "P12M_044N"), - r_fem_18_oth = c("P12K_044N", "P12N_044N", "P12O_044N"), - r_fem_19_whi = "P12I_045N", - r_fem_19_bla = "P12J_045N", - r_fem_19_his = "P12H_045N", - r_fem_19_asi = c("P12L_045N", "P12M_045N"), - r_fem_19_oth = c("P12K_045N", "P12N_045N", 
"P12O_045N"), - r_fem_20_whi = "P12I_046N", - r_fem_20_bla = "P12J_046N", - r_fem_20_his = "P12H_046N", - r_fem_20_asi = c("P12L_046N", "P12M_046N"), - r_fem_20_oth = c("P12K_046N", "P12N_046N", "P12O_046N"), - r_fem_21_whi = "P12I_047N", - r_fem_21_bla = "P12J_047N", - r_fem_21_his = "P12H_047N", - r_fem_21_asi = c("P12L_047N", "P12M_047N"), - r_fem_21_oth = c("P12K_047N", "P12N_047N", "P12O_047N"), - r_fem_22_whi = "P12I_048N", - r_fem_22_bla = "P12J_048N", - r_fem_22_his = "P12H_048N", - r_fem_22_asi = c("P12L_048N", "P12M_048N"), - r_fem_22_oth = c("P12K_048N", "P12N_048N", "P12O_048N"), - r_fem_23_whi = "P12I_049N", - r_fem_23_bla = "P12J_049N", - r_fem_23_his = "P12H_049N", - r_fem_23_asi = c("P12L_049N", "P12M_049N"), - r_fem_23_oth = c("P12K_049N", "P12N_049N", "P12O_049N") - ) - ) - ) -}) - -test_that("census_geo_api_url()", { - expect_equal( - census_geo_api_url("2020"), - "https://api.census.gov/data/2020/dec/dhc?" - ) - expect_equal( - census_geo_api_url("2010"), - "https://api.census.gov/data/2010/dec/sf1?" - ) - expect_equal( - census_geo_api_url("2000"), - "https://api.census.gov/data/2000/dec/sf1?" - ) - expect_error(census_geo_api_url("2023")) -}) diff --git a/tests/testthat/test-census_helper.R b/tests/testthat/test-census_helper.R deleted file mode 100644 index 4005862..0000000 --- a/tests/testthat/test-census_helper.R +++ /dev/null @@ -1,22 +0,0 @@ -# Note: must provide a valid U.S. Census API key for test cases that use U.S. 
Census statistics -# > usethis::edit_r_profile -# Sys.setenv("CENSUS_API_KEY" = "yourkey") -# For testing package coverage use: Sys.setenv("NOT_CRAN" = "TRUE") -options("piggyback.verbose" = FALSE) -options("wru_data_wd" = TRUE) - -skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY"))) -test_that("census_helper old still returns predictions", { - data(voters) - census <- readRDS(test_path("data/census_test_nj_block_2020.rds")) - x <- census_helper( - voter.file = voters, - states = "NJ", - year = "2020", - census.data = census - ) - expect_named(x, c('VoterID', 'surname', 'state', 'CD', 'county', - 'tract', 'block', 'precinct', 'age', 'sex', 'party', - 'PID', 'place', 'last', 'first', 'r_whi', 'r_bla', 'r_his', - 'r_asi', 'r_oth')) -}) diff --git a/tests/testthat/test-census_helper_v2.R b/tests/testthat/test-census_helper_v2.R deleted file mode 100644 index 3ea900d..0000000 --- a/tests/testthat/test-census_helper_v2.R +++ /dev/null @@ -1,91 +0,0 @@ -# Note: must provide a valid U.S. Census API key for test cases that use U.S. 
Census statistics -# > usethis::edit_r_profile -# Sys.setenv("CENSUS_API_KEY" = "yourkey") -# For testing package coverage use: Sys.setenv("NOT_CRAN" = "TRUE") -options("piggyback.verbose" = FALSE) -options("wru_data_wd" = TRUE) - -skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY"))) -test_that("Fails if 'precinct' is set as the geo var",{ - skip_on_cran() - set.seed(42) - data(voters) - future::plan(future::multisession) - census <- readRDS(test_path("data/new_census_table_NJ_2020.rds")) - expect_error( - census_helper_new( - voter.file = voters, - states = "all", - geo = "precinct", - age = FALSE, - sex = FALSE, - year = "2020", - census.data = census, - retry = 3, - use.counties = FALSE, - skip_bad_geos = FALSE - ), - "Error: census_helper_new function does not currently support precinct-level data.") -}) - -skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY"))) -test_that("helper returns verified census tract data",{ - skip_on_cran() - set.seed(42) - data(voters) - future::plan(future::multisession) - census <- readRDS(test_path("data/new_census_table_NJ_2020.rds")) - x <- census_helper_new( - voter.file = voters, - states = "NJ", - geo = "tract", - age = FALSE, - sex = FALSE, - year = "2020", - census.data = census, - retry = 3, - use.counties = FALSE, - skip_bad_geos = FALSE - ) - expect_equal(x[x$surname == "Lopez", "r_whi"], 0.767197, tolerance = 0.000001) - expect_equal(x[x$surname == "Khanna", "r_whi"], 0.708026, tolerance = 0.000001) - expect_equal(x[x$surname == "Lopez", "r_bla"], 0.09522743, tolerance = 0.000001) - expect_equal(x[x$surname == "Khanna", "r_bla"], 0.09544469, tolerance = 0.000001) -}) - -skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY"))) -test_that("New tables and legacy tables return equal race predictions",{ - skip_on_cran() - set.seed(42) - data(voters) - future::plan(future::multisession) - # legacy redistricting table - census <- readRDS(test_path("data/census_test_nj_block_2020.rds")) - x <- census_helper_new( - voter.file = voters, - 
states = "NJ", - geo = "tract", - age = FALSE, - sex = FALSE, - year = "2020", - census.data = census, - use.counties = FALSE - ) - # use new table source - new_census <- readRDS(test_path("data/new_census_table_NJ_2020.rds")) - y <- census_helper_new( - voter.file = voters, - states = "NJ", - geo = "tract", - age = FALSE, - sex = FALSE, - year = "2020", - census.data = new_census, - use.counties = FALSE - ) - expect_equal(x$r_whi, y$r_whi, tolerance = .01) - # expect_equal(x$r_bla, y$r_bla, tolerance = .01) - expect_equal(x$r_his, y$r_his, tolerance = .01) - expect_equal(x$r_asi, y$r_asi, tolerance = .01) - # expect_equal(x$r_oth, y$r_oth, tolerance = .01) -}) diff --git a/tests/testthat/test-format_legacy_data.R b/tests/testthat/test-format_legacy_data.R deleted file mode 100644 index d5adc6a..0000000 --- a/tests/testthat/test-format_legacy_data.R +++ /dev/null @@ -1,9 +0,0 @@ -options("piggyback.verbose" = FALSE) -options("wru_data_wd" = TRUE) - -test_that("legacy data returns expected geo groups",{ - skip_on_cran() - de <- format_legacy_data(PL94171::pl_url('DE', 2020), state = "DE") - - expect_named(de, c("county", "tract", "blockGroup", "block")) -}) diff --git a/tests/testthat/test-get_census_data.R b/tests/testthat/test-get_census_data.R index 598890e..89e7cb9 100644 --- a/tests/testthat/test-get_census_data.R +++ b/tests/testthat/test-get_census_data.R @@ -5,7 +5,7 @@ if(Sys.getenv("CENSUS_API_KEY") != "") { r <- suppressMessages(get_census_data( key = NULL, state = c("DC"), - census.geo = "block" + census.geo = "block", )) expect_named(r$DC, c("state", "age", "sex", "year", "block", "tract", "county")) @@ -13,7 +13,6 @@ if(Sys.getenv("CENSUS_API_KEY") != "") { }) test_that("Census block_group download works", { - future::plan(future::multisession) r <- suppressMessages(get_census_data( key = NULL, state = "RI", @@ -24,7 +23,6 @@ if(Sys.getenv("CENSUS_API_KEY") != "") { }) test_that("Census tract download works", { - future::plan(future::multisession) r <- 
suppressMessages(get_census_data( key = NULL, state = c("NY"), @@ -37,7 +35,6 @@ if(Sys.getenv("CENSUS_API_KEY") != "") { }) test_that("Census county download works", { - future::plan(future::multisession) r <- suppressMessages(get_census_data( key = NULL, state = "NJ", @@ -49,7 +46,6 @@ if(Sys.getenv("CENSUS_API_KEY") != "") { }) test_that("Census place download works", { - future::plan(future::multisession) r <- suppressMessages(get_census_data( key = NULL, state = "RI", @@ -59,22 +55,4 @@ if(Sys.getenv("CENSUS_API_KEY") != "") { expect_true(all(r$RI$place$state == "RI")) }) - test_that("Census ZCTA download works", { - r <- suppressMessages(get_census_data( - key = NULL, - state = "DC", - census.geo = "zcta" - )) - expect_named(r$DC, c("state", "age", "sex", "year", "zcta")) - expect_true(all(r$DC$zcta$state == "DC")) - - r <- suppressMessages(get_census_data( - key = NULL, - state = "DC", - census.geo = "zcta", - year = "2010" - )) - expect_named(r$DC, c("state", "age", "sex", "year", "zcta")) - expect_true(all(r$DC$zcta$state == "DC")) - }) } diff --git a/tests/testthat/test-predict_race_2010.R b/tests/testthat/test-predict_race_2010.R index a48031c..dbff9f0 100644 --- a/tests/testthat/test-predict_race_2010.R +++ b/tests/testthat/test-predict_race_2010.R @@ -14,7 +14,7 @@ test_that("Tests surname only predictions", { year = 2010, surname.only = TRUE)) # Test and confirm prediction output is as expected - expect_equal(dim(x), c(10, 20)) + expect_equal(dim(x), c(10, 21)) expect_equal(sum(is.na(x)), 0) expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.045, tolerance = 0.01) expect_equal(round(x[x$surname == "Johnson", "pred.his"], 4), 0.0272, tolerance = 0.01) @@ -33,7 +33,7 @@ test_that("Test BISG NJ at county level", { )) expect_equal(as.character(x$VoterID), as.character(c(1, 2, 4, 5, 6, 8, 9))) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x)), 0L) expect_equal(sum(x$surname == "Johnson"), 0) 
expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.0314, tolerance = 0.01) @@ -82,7 +82,7 @@ test_that("BISG NJ at block level", { use.counties = TRUE) ) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x$pred.asi)), 0L) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.7640, tolerance = 0.01) @@ -108,7 +108,7 @@ test_that("BISG NJ at block_group level", { use.counties = TRUE) ) - expect_equal(dim(x), c(7, 21)) + expect_equal(dim(x), c(7, 22)) expect_equal(sum(is.na(x$pred.asi)), 0) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.9183, tolerance = 0.01) @@ -125,7 +125,7 @@ test_that("Fails on territories", { ) }) -test_that("Fails on missing geolocation if skip_bad_geos default is used", { +test_that("Fails on missing geolocation", { skip_on_cran() set.seed(42) data(voters) @@ -138,25 +138,8 @@ test_that("Fails on missing geolocation if skip_bad_geos default is used", { census.data = census, use.counties = TRUE) ), - "Stopping predictions. Please revise" - ) -}) - -test_that("Skip_bad_geos option successfully returns working geolocations", { - skip_on_cran() - set.seed(42) - data(voters) - census <- readRDS(test_path("data/census_test_nj_block_2010.rds")) - test_drop <- suppressMessages(predict_race( - voter.file = voters[voters$state == "NJ", ], - year = 2010, - census.geo = "block", - census.key = NULL, - census.data = census, - skip_bad_geos = TRUE, - use.counties = TRUE) + "The following locations in the voter\\.file are not available" ) - expect_equal(nrow(test_drop), 6) }) test_that("Handles zero-pop. geolocations", { @@ -173,7 +156,7 @@ test_that("Handles zero-pop. 
geolocations", { census.data = census, use.counties = TRUE) ) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x$pred.asi)), 0) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.91, tolerance = 0.01) @@ -182,7 +165,7 @@ test_that("Handles zero-pop. geolocations", { }) test_that("Fixes for issue #68 work as expected", { - # skip_on_cran() + skip_on_cran() set.seed(42) surname <- c("SULLIVAN") one <- predict_race(voter.file=data.frame(surname), year = 2010, surname.only=TRUE) diff --git a/tests/testthat/test-predict_race_2020.R b/tests/testthat/test-predict_race_2020.R index 89f8962..65b8c75 100644 --- a/tests/testthat/test-predict_race_2020.R +++ b/tests/testthat/test-predict_race_2020.R @@ -4,18 +4,6 @@ options("piggyback.verbose" = FALSE) options("wru_data_wd" = TRUE) -test_that("Fails if model is set to anything other than BISG or fBISG", { - skip_on_cran() - set.seed(42) - data(voters) - expect_error(suppressMessages(predict_race( - voter.file = voters, - surname.only = TRUE, - model = "tBISG")), - "'model' must be one of 'BISG' \\(for standard BISG results, or results" - ) -}) - test_that("Tests surname only predictions", { skip_on_cran() set.seed(42) @@ -25,7 +13,7 @@ test_that("Tests surname only predictions", { voter.file = voters, surname.only = TRUE)) # Test and confirm prediction output is as expected - expect_equal(dim(x), c(10, 20)) + expect_equal(dim(x), c(10, 21)) expect_equal(sum(is.na(x)), 0) expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.045, tolerance = 0.01) expect_equal(round(x[x$surname == "Johnson", "pred.his"], 4), 0.0272, tolerance = 0.01) @@ -43,7 +31,7 @@ test_that("Test BISG NJ at county level", { )) expect_equal(as.character(x$VoterID), as.character(c(1, 2, 4, 5, 6, 8, 9))) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x)), 0L) expect_equal(sum(x$surname == "Johnson"), 0) 
expect_equal(round(x[x$surname == "Khanna", "pred.whi"], 4), 0.0181, tolerance = 0.01) @@ -90,7 +78,7 @@ test_that("BISG NJ at block level", { use.counties = TRUE) ) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x$pred.asi)), 0L) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.8078, tolerance = 0.01) @@ -115,7 +103,7 @@ test_that("BISG NJ at block_group level", { use.counties = TRUE) ) - expect_equal(dim(x), c(7, 21)) + expect_equal(dim(x), c(7, 22)) expect_equal(sum(is.na(x$pred.asi)), 0) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.9374, tolerance = 0.01) @@ -132,7 +120,7 @@ test_that("Fails on territories", { ) }) -test_that("Fails on missing geolocation if skip_bad_geos default is used", { +test_that("Fails on missing geolocation", { skip_on_cran() set.seed(42) data(voters) @@ -144,26 +132,10 @@ test_that("Fails on missing geolocation if skip_bad_geos default is used", { census.data = census, use.counties = TRUE) ), - "Stopping predictions. Please revise" + "The following locations in the voter\\.file are not available" ) }) -test_that("Skip_bad_geos option successfully returns working geolocations", { - skip_on_cran() - set.seed(42) - data(voters) - census <- readRDS(test_path("data/census_test_nj_block_2020.rds")) - test_drop <- suppressMessages(predict_race( - voter.file = voters[voters$state == "NJ", ], - census.geo = "block", - census.key = NULL, - census.data = census, - skip_bad_geos = TRUE, - use.counties = TRUE) - ) - expect_equal(nrow(test_drop), 1) - }) - test_that("Handles zero-pop. geolocations", { skip_on_cran() set.seed(42) @@ -177,7 +149,7 @@ test_that("Handles zero-pop. 
geolocations", { census.data = census, use.counties = TRUE) ) - expect_equal(dim(x), c(7, 20)) + expect_equal(dim(x), c(7, 21)) expect_equal(sum(is.na(x$pred.asi)), 0) expect_true(!any(duplicated(x$surname))) expect_equal(x[x$surname == "Khanna", "pred.asi"], 0.9444, tolerance = 0.01) diff --git a/tests/testthat/test-race_prediction_funs.R b/tests/testthat/test-race_prediction_funs.R deleted file mode 100644 index af02f13..0000000 --- a/tests/testthat/test-race_prediction_funs.R +++ /dev/null @@ -1,32 +0,0 @@ -# Note: must provide a valid U.S. Census API key for test cases that use U.S. Census statistics -# > usethis::edit_r_profile -# Sys.setenv("CENSUS_API_KEY" = "yourkey") -# For testing package coverage use: Sys.setenv("NOT_CRAN" = "TRUE") -options("piggyback.verbose" = FALSE) -options("wru_data_wd" = TRUE) - -skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY"))) -test_that("old predict_race fxn returns sensible predictions for test names", { - data("voters") - x <- .predict_race_old( - voter.file = voters, - census.surname = TRUE, - surname.only = TRUE, - surname.year = 2010, - census.geo = "tract", - census.key = Sys.getenv("CENSUS_API_KEY"), - age = FALSE, - sex = FALSE, - year = "2010", - retry = 3, - impute.missing = TRUE, - use.counties = FALSE - ) - expect_equal(x[x$surname == "Lopez", "pred.whi"], 0.0486000, tolerance = .000001) - expect_equal(x[x$surname == "Khanna", "pred.whi"], 0.0676000, tolerance = .000001) - expect_equal(x[x$surname == "Lopez", "pred.bla"], 0.00570000, tolerance = .000001) - expect_equal(x[x$surname == "Khanna", "pred.bla"], 0.00430000, tolerance = .000001) - expect_equal(x[x$surname == "Lopez", "pred.his"], 0.92920000, tolerance = .000001) #assumed to be high Hispanic score - expect_equal(x[x$surname == "Zhou", "pred.asi"], 0.98200000, tolerance = .000001) #assumed to be high Asian score - -}) diff --git a/tests/testthat/test-rollup.R b/tests/testthat/test-rollup.R deleted file mode 100644 index cd4d533..0000000 --- 
a/tests/testthat/test-rollup.R +++ /dev/null @@ -1,164 +0,0 @@ -if(Sys.getenv("CENSUS_API_KEY") != "") { - test_that('Roll-ups of proportions between levels for race behave as expected', { - skip_on_cran() - future::plan(future::multisession) - r <- suppressMessages(get_census_data( - key = NULL, - state = c("DE"), - census.geo = "block", - )) - - r_tract_from_block <- r$DE$block |> - dplyr::group_by(tract) |> - dplyr::summarize( - sum_whi = sum(r_whi), - sum_bla = sum(r_bla), - sum_his = sum(r_his), - sum_asi = sum(r_asi), - sum_oth = sum(r_oth) - ) - - r_tract_level <- r$DE$tract |> - dplyr::select(tract, r_whi:r_oth) |> - dplyr::group_by(tract) |> - dplyr::summarize( - sum_whi = sum(r_whi), - sum_bla = sum(r_bla), - sum_his = sum(r_his), - sum_asi = sum(r_asi), - sum_oth = sum(r_oth) - ) - - r_county_from_tract <- r$DE$tract |> - dplyr::select(county, r_whi:r_oth) |> - dplyr::group_by(county) |> - dplyr::summarize( - sum_whi = sum(r_whi), - sum_bla = sum(r_bla), - sum_his = sum(r_his), - sum_asi = sum(r_asi), - sum_oth = sum(r_oth) - ) - - r_county_level <- r$DE$county |> - dplyr::select(county, r_whi:r_oth) |> - dplyr::group_by(county) |> - dplyr::summarize( - sum_whi = sum(r_whi), - sum_bla = sum(r_bla), - sum_his = sum(r_his), - sum_asi = sum(r_asi), - sum_oth = sum(r_oth) - ) - - expect_true( - all.equal( - colSums(r$DE$block[, c('r_whi', 'r_bla', 'r_his', 'r_asi', 'r_oth')]), - c('r_whi' = 1, 'r_bla' = 1, 'r_his' = 1, 'r_asi' = 1, 'r_oth' = 1) - ) - ) - expect_equal(r_county_from_tract, r_county_level, tolerance = 1e-7) - expect_equal(r_tract_level, r_tract_from_block, tolerance = 1e-7) - }) - - test_that('Roll-ups of population sums between levels for race behave as expected', { - future::plan(future::multisession) - r <- suppressMessages(get_census_data( - key = NULL, - state = c("WY"), - census.geo = "block", - )) - - r_sum_from_block <- r$WY$block |> - dplyr::select(-state, -county, -tract, -block, -dplyr::starts_with("r_")) |> - colSums() - - 
r_sum_from_tract <- r$WY$tract |> - dplyr::select(-state, -county, -tract, -dplyr::starts_with("r_")) |> - colSums() - - r_sum_from_county <- r$WY$county |> - dplyr::select(-state, -county, -dplyr::starts_with("r_")) |> - colSums() - - expect_equal(r_sum_from_block, r_sum_from_tract) - expect_equal(r_sum_from_block, r_sum_from_county) - - r_zcta_level <- suppressMessages(get_census_data( - key = NULL, - state = "WY", - census.geo = "zcta" - )) - - r_sum_from_zcta <- r_zcta_level$WY$zcta |> - dplyr::select(-state, -zcta, -dplyr::starts_with("r_")) |> - colSums() - - expect_equal(r_sum_from_block, r_sum_from_zcta, tolerance = 0.001) - }) - - test_that('Roll-ups for ZIP and county race sum to same for state', { - r_zcta_level <- suppressMessages(get_census_data( - key = NULL, - state = "RI", - census.geo = "zcta" - )) - - r_county_level <- suppressMessages(get_census_data( - key = NULL, - state = "RI", - census.geo = "county" - )) - - expect_equal( - r_zcta_level$RI$zcta |> - dplyr::select(-state, -zcta, -dplyr::starts_with("r_")) |> - colSums(), - r_county_level$RI$county |> - dplyr::select(-state, -county, -dplyr::starts_with("r_")) |> - colSums(), - tolerance = 0.001 - ) - }) - - test_that('Roll-ups of population sums between levels for race/sex/age behave as expected', { - skip_on_cran() - future::plan(future::multisession) - r <- suppressMessages(get_census_data( - key = NULL, - state = c("AK"), - census.geo = "block", - sex = TRUE, - age = TRUE - )) - - r_sum_from_block <- r$AK$block |> - dplyr::select(dplyr::starts_with("r_")) |> - colSums() - - r_sum_from_tract <- r$AK$tract |> - dplyr::select(dplyr::starts_with("r_")) |> - colSums() - - r_sum_from_county <- r$AK$county |> - dplyr::select(dplyr::starts_with("r_")) |> - colSums() - - expect_equal(r_sum_from_block, r_sum_from_tract) - expect_equal(r_sum_from_block, r_sum_from_county) - - r_zcta_level <- suppressMessages(get_census_data( - key = NULL, - state = "AK", - census.geo = "zcta", - sex = TRUE, - age = 
TRUE - )) - - r_sum_from_zcta <- r_zcta_level$AK$zcta |> - dplyr::select(dplyr::starts_with("r_")) |> - colSums() - - expect_equal(r_sum_from_block, r_sum_from_zcta, tolerance = 0.001) - }) -} diff --git a/tests/testthat/wru-data-census_last_c.rds b/tests/testthat/wru-data-census_last_c.rds new file mode 100644 index 0000000..b926aaf Binary files /dev/null and b/tests/testthat/wru-data-census_last_c.rds differ diff --git a/tests/testthat/wru-data-first_c.rds b/tests/testthat/wru-data-first_c.rds new file mode 100644 index 0000000..3b364a7 Binary files /dev/null and b/tests/testthat/wru-data-first_c.rds differ diff --git a/tests/testthat/wru-data-last_c.rds b/tests/testthat/wru-data-last_c.rds new file mode 100644 index 0000000..57e7bce Binary files /dev/null and b/tests/testthat/wru-data-last_c.rds differ diff --git a/tests/testthat/wru-data-mid_c.rds b/tests/testthat/wru-data-mid_c.rds new file mode 100644 index 0000000..0228974 Binary files /dev/null and b/tests/testthat/wru-data-mid_c.rds differ