From fe3bab165c69eec90332b9c05da8d138a837a615 Mon Sep 17 00:00:00 2001 From: Rustam Date: Thu, 15 Jan 2026 14:05:17 +0100 Subject: [PATCH 1/2] updated roxxygen for check_heaping_general, also fixed some warnings in opag. Looks like someone has updated the file. --- DESCRIPTION | 2 +- NAMESPACE | 2 +- R/input_diagnostics.R | 14 +++-- R/odap.R | 76 +++++++++++++------------- man/check_heaping_general.Rd | 2 +- man/movepop.Rd | 100 ----------------------------------- 6 files changed, 51 insertions(+), 145 deletions(-) delete mode 100644 man/movepop.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 5d89616..ca1c74a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,7 +12,7 @@ License: file LICENSE LazyLoad: yes LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.1), ggplot2 diff --git a/NAMESPACE b/NAMESPACE index 8958af9..306e786 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,7 +36,6 @@ export(lt_flexible_chunk) export(lt_plot) export(lt_summary) export(lt_summary_chunk) -export(movepop) export(odap_opag) export(plot_compare_rates) export(plot_histogram) @@ -109,6 +108,7 @@ importFrom(dplyr,last) importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,n) +importFrom(dplyr,pick) importFrom(dplyr,pull) importFrom(dplyr,reframe) importFrom(dplyr,rename) diff --git a/R/input_diagnostics.R b/R/input_diagnostics.R index ee882cd..7cc4a0b 100644 --- a/R/input_diagnostics.R +++ b/R/input_diagnostics.R @@ -2,11 +2,15 @@ #' @description Check the age heaping for 5 or 1 year data. #' @param data data.frame. User file from the read_data command with the minimum data on `Exposures`, `Death` and `Age`. Data can be both in 5 and 1 year age intervals #' @param y character.Variable name for which the heaping should be checked `Deaths` or `Exposures`. 
-#' @return A data.frame with 2 columns `method` - the method used for age heaping evaluation and `result` - the resulting heaping measure +#' @return A data.frame with 3 columns `method` - the method used for age heaping evaluation, `result` - the resulting heaping measure, and `.id` - user specified groups identifier (if missing is set to "all") +#' @importFrom tibble tibble +#' @importFrom tidyr unnest pivot_longer +#' @importFrom tidyselect all_of #' @importFrom stringr str_detect -#' @importFrom dplyr bind_rows left_join join_by +#' @importFrom dplyr bind_rows left_join select group_nest mutate pull group_by #' @importFrom rlang .data -#' @importFrom DemoTools check_heaping_roughness check_heaping_bachi check_heaping_myers check_heaping_sawtooth +#' @importFrom purrr map map2 +#' @importFrom DemoTools is_single check_heaping_roughness check_heaping_bachi check_heaping_myers check_heaping_sawtooth #' @export #' @examples #' \dontrun{ @@ -154,8 +158,8 @@ check_heaping_general <- function(data, y) { Value = pull(.x, !!sym(y)), Age = .x$Age, ageMin = 30)), - "res" = map2(.x = roughness, - .y = sawtooth, ~ + "res" = map2(.x = .data$roughness, + .y = .data$sawtooth, ~ bind_rows( classify( x = .x, diff --git a/R/odap.R b/R/odap.R index b28553f..26fd8ea 100644 --- a/R/odap.R +++ b/R/odap.R @@ -16,7 +16,7 @@ #' @param year Numeric vector of years to filter by (default \code{1971}). #' @param sex Character scalar indicating sex to filter by, e.g., \code{"M"} or \code{"F"} (default \code{"M"}). #' @param i18n Optional internationalization object for translating plot labels and titles (default \code{NULL}). 
-#' @importFrom dplyr filter arrange across select mutate group_by reframe ungroup full_join group_nest +#' @importFrom dplyr filter arrange across select mutate group_by reframe ungroup full_join group_nest pick #' @importFrom tibble as_tibble #' @importFrom DemoTools lt_single_mx OPAG groupAges #' @importFrom tidyr pivot_longer @@ -159,7 +159,7 @@ odap_opag <- function(data_in = NULL, conditional_filter("country_code", country_code) %>% conditional_filter("name", name) %>% conditional_filter("year", year) %>% - conditional_filter("sex", sex) %>% + conditional_filter("sex", sex) %>% select(any_of(c("name", "country_code", "sex", "year", "age", "nlx"))) # Rename nlx back to nLx for consistency with downstream code if("nlx" %in% names(nLx)) { @@ -184,36 +184,38 @@ odap_opag <- function(data_in = NULL, } + # add env solution suppressPackageStartupMessages(library(latest_wpp, character.only = TRUE)) - data("mx1dt", package = latest_wpp) + + env <- new.env() + data("mx1dt", package = latest_wpp, envir = env) - nLx <- mx1dt %>% + nLx <- env$mx1dt %>% as_tibble() %>% - select(-.data$mxB) %>% + select(-c("mxB")) %>% conditional_filter("country_code", country_code) %>% conditional_filter("name", name) %>% conditional_filter("year", year) %>% pivot_longer( - cols = c(.data$mxM, .data$mxF), - names_to = "sex", + cols = c("mxM", "mxF"), + names_to = "sex", values_to = "mx" ) %>% mutate(sex = substr(.data$sex, 3, 3)) %>% conditional_filter("sex", sex) - group_vars <- intersect(c("name","country_code","sex","year"), names(nLx)) - - nLx <- nLx %>% - group_by(across(all_of(intersect(c("name", "country_code", "sex", "year"), names(.))))) %>% + nLx <- nLx %>% + group_by(across(any_of(c("name", "country_code", "sex", "year")))) %>% reframe( - lt_single_mx(nMx = .data$mx, Age = .data$age), .groups = "drop") %>% - select(.data$name, - .data$country_code, - .data$sex, - .data$year, - age = .data$Age, - .data$AgeInt, - .data$nLx) + lt_single_mx(nMx = .data$mx, + Age = .data$age), 
.groups = "drop") %>% + select("name", + "country_code", + "sex", + "year", + "age" = "Age", + "AgeInt", + "nLx") # # nLx <- nLx %>% # as_tibble() %>% @@ -237,7 +239,7 @@ odap_opag <- function(data_in = NULL, conditional_filter("country_code", country_code) %>% conditional_filter("name", name) %>% conditional_filter("year", year) %>% - conditional_filter("sex", sex) %>% + conditional_filter("sex", sex) %>% select(any_of(c("name", "country_code", "sex", "year", "age", "AgeInt", "nLx"))) } @@ -261,7 +263,7 @@ odap_opag <- function(data_in = NULL, nLx <- nLx %>% - group_by(across(all_of(intersect(c("name", "country_code", "sex", "year"), names(.))))) %>% + group_by(across(any_of(c("name", "country_code", "sex", "year")))) %>% reframe({ dat <- pick(everything()) age_vals <- seq(0, max(dat$age), by = age_spacing) @@ -269,9 +271,7 @@ odap_opag <- function(data_in = NULL, tibble(age = age_vals, nLx = nLx_vals) }) - } else { } - # ---------------------------------------------------------------------------- # # Part 3: Now we have user data and reference data and we can use the OPAG function @@ -359,21 +359,23 @@ odap_opag <- function(data_in = NULL, Pop_vals <- .x$pop nLx_vals <- .x$nLx OPAG( - Pop = Pop_vals, - Age_Pop = Age_vals, - nLx = nLx_vals, - Age_nLx = Age_vals, - Age_fit = Age_fit, - AgeInt_fit = AgeInt_fit, + Pop = Pop_vals, + Age_Pop = Age_vals, + nLx = nLx_vals, + Age_nLx = Age_vals, + Age_fit = Age_fit, + AgeInt_fit = AgeInt_fit, Redistribute_from = Redistribute_from, - OAnew = OAnew, - method = method + OAnew = OAnew, + method = method ) }), plots = map2(.data$data, .data$results, function(.x, .y) { - old <- tibble(pop = .x$pop, age = .x$age) %>% + old <- tibble(pop = .x$pop, + age = .x$age) %>% filter(.data$age > Redistribute_from) - new <- tibble(pop = .y$Pop_out, age = .y$Age_out) %>% + new <- tibble(pop = .y$Pop_out, + age = .y$Age_out) %>% filter(.data$age > Redistribute_from) # Note: translated text variables (title_text, age_text, etc.) 
@@ -383,21 +385,21 @@ odap_opag <- function(data_in = NULL, # Rename columns to translated versions for hover tooltips (following lifetable pattern) names(old)[names(old) == "age"] <- age_text names(old)[names(old) == "pop"] <- pop_text - old[[type_text]] <- original_text + old[[type_text]] <- original_text names(new)[names(new) == "age"] <- age_text names(new)[names(new) == "pop"] <- pop_text - new[[type_text]] <- redistributed_text + new[[type_text]] <- redistributed_text # Create named vector for colors using translated keys - color_values <- c("black", "red") + color_values <- c("black", "red") names(color_values) <- c(original_text, redistributed_text) # Build plot using .data[[]] for runtime evaluation (NOT !!sym()) ggplot() + geom_point(data = old, aes(x = .data[[age_text]], y = .data[[pop_text]], color = .data[[type_text]]), size = 2) + - geom_line(data = new, aes(x = .data[[age_text]], y = .data[[pop_text]], color = .data[[type_text]]), linewidth = 1) + + geom_line( data = new, aes(x = .data[[age_text]], y = .data[[pop_text]], color = .data[[type_text]]), linewidth = 1) + scale_color_manual(name = type_text, values = color_values) + labs(x = age_text, y = pop_text, title = title_text) + theme_minimal(base_size = 14) + diff --git a/man/check_heaping_general.Rd b/man/check_heaping_general.Rd index b67b4ec..adaaa2a 100644 --- a/man/check_heaping_general.Rd +++ b/man/check_heaping_general.Rd @@ -12,7 +12,7 @@ check_heaping_general(data, y) \item{y}{character.Variable name for which the heaping should be checked \code{Deaths} or \code{Exposures}.} } \value{ -A data.frame with 2 columns \code{method} - the method used for age heaping evaluation and \code{result} - the resulting heaping measure +A data.frame with 3 columns \code{method} - the method used for age heaping evaluation, \code{result} - the resulting heaping measure, and \code{.id} - user specified groups identifier (if missing is set to "all") } \description{ Check the age heaping for 5 or 1 year 
data. diff --git a/man/movepop.Rd b/man/movepop.Rd deleted file mode 100644 index 51f8dee..0000000 --- a/man/movepop.Rd +++ /dev/null @@ -1,100 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/movepop.R -\name{movepop} -\alias{movepop} -\title{Project Population Between Two Dates Using Age-Specific Rates} -\usage{ -movepop( - initial_date, - desired_date, - male_pop, - female_pop, - male_mx, - female_mx, - asfr, - annual_net_migrants = 0, - age_groups = NULL, - age_format = "auto" -) -} -\arguments{ -\item{initial_date}{Numeric or Date. The starting year/date of the population.} - -\item{desired_date}{Numeric or Date. The year/date to which the population is projected.} - -\item{male_pop}{Numeric vector of male population counts by age.} - -\item{female_pop}{Numeric vector of female population counts by age.} - -\item{male_mx}{Numeric vector of male mortality rates by age.} - -\item{female_mx}{Numeric vector of female mortality rates by age.} - -\item{asfr}{Numeric vector of age-specific fertility rates corresponding to female ages.} - -\item{annual_net_migrants}{Numeric scalar for annual net migration to include in growth (default 0).} - -\item{age_groups}{Optional character vector of age group labels. If \code{NULL}, default labels are assigned.} - -\item{age_format}{Character specifying age structure: \code{"single_year"}, \code{"five_year"}, or \code{"auto"} (default \code{"auto"}). 
When \code{"auto"}, the function infers format from the length of population vectors.} -} -\value{ -A list with three elements: -\describe{ -\item{\code{initial_summaries}}{A tibble summarising total births, deaths, crude rates, growth rate, total population, and adjustment factor.} -\item{\code{projected_summaries}}{A tibble summarising projected total population by sex and overall sex ratio.} -\item{\code{data}}{A tibble with age-specific populations, ASFR, births, deaths, projected populations by sex, both-sexes totals, and sex ratios.} -} -} -\description{ -The \code{movepop()} function projects a population from an initial date to a desired date -using age-specific fertility (ASFR) and mortality rates. It supports single-year or -five-year age group formats and optionally accounts for net migration. -The function produces both projected population by age and sex and summary statistics. -} -\details{ -\itemize{ -\item The function ensures that population and mortality vectors have equal lengths. -\item If \code{age_format} is \code{"auto"}, lengths of 18 imply \code{"five_year"} and lengths ≥101 imply \code{"single_year"}. -\item Age groups are automatically generated if not provided, and ASFR values are aligned starting at the age group \code{"15"} (or equivalent). -\item Projected populations are computed using exponential growth based on crude birth rate, crude death rate, and net migration rate. -\item The function returns both age-specific projected populations and summary statistics. 
-} -} -\examples{ -\dontrun{ -male_pop <- c(48875, 164390, 173551, 130297, 101143, 73615, 60594, 55175, - 49530, 46562, 39028, 27837, 22110, 18066, 15340, 13318, -#' 12002, 6424) - -female_pop <- c(47105, 159546, 168760, 119437, 92080, 70515, 58801, 53381, - 46757, 41164, 33811, 24121, 19315, 16319, 14058, 12302, - 11047, 5922) - -male_mx <- c(0.12427, 0.01639, 0.00274, 0.00167, 0.00251, 0.00380, 0.00382, - 0.00442, 0.00506, 0.00663, 0.00872, 0.01240, 0.01783, 0.02700, - 0.04126, 0.06785, 0.11287, 0.21015) - -female_mx <- c(0.11050, 0.01577, 0.00254, 0.00159, 0.00232, 0.00304, 0.00344, - 0.00370, 0.00418, 0.00492, 0.00592, 0.00831, 0.01182, 0.01942, - 0.03221, 0.05669, 0.09771, 0.19385) - -asfr <- c(0.199, 0.478, 0.418, 0.321, 0.163, 0.071, 0.028) - -res <- movepop( -initial_date = 1973.58, -desired_date = 1973.50, -male_pop = male_pop, -female_pop = female_pop, -male_mx = male_mx, -female_mx = female_mx, -asfr = asfr, -annual_net_migrants = -50000, -age_format = "five_year" - -res$initial_summaries -res$projected_summaries -head(res$data) -} - -} From 39af45051affc0a9b9c0f0d68aec0c676a0edc26 Mon Sep 17 00:00:00 2001 From: Rustam Date: Tue, 17 Feb 2026 18:20:40 +0100 Subject: [PATCH 2/2] Added functions: check_data_graduate_time check_data_heaping check_data_lifetable check_heaping_general can take rate proportion now graduate_time added smooth1d can take rate proportion now and is updated to have transformation minor adjustments to namespace and description and etc --- DESCRIPTION | 4 + NAMESPACE | 15 +- R/checkers.R | 333 ++++++++++++++++++- R/graduation.R | 375 ++++++++++++++++++++-- R/input_diagnostics.R | 370 ++++++++++++++------- R/odap.R | 14 +- R/smoothing1d.R | 552 +++++++++++++++----------------- man/check_data_graduate_time.Rd | 35 ++ man/check_data_heaping.Rd | 51 +++ man/check_data_lifetable.Rd | 67 ++++ man/check_heaping_general.Rd | 51 ++- man/check_smooth.Rd | 77 ----- man/graduate_auto.Rd | 2 +- man/graduate_auto_5.Rd | 2 +- man/graduate_time.Rd | 
71 ++++ man/odap_opag.Rd | 5 - man/plot_smooth_compare.Rd | 4 +- man/smooth1d.Rd | 176 +++++----- man/smooth1d_chunk.Rd | 34 -- man/smooth_flexible.Rd | 2 +- man/smooth_flexible_chunk.Rd | 2 +- 21 files changed, 1595 insertions(+), 647 deletions(-) create mode 100644 man/check_data_graduate_time.Rd create mode 100644 man/check_data_heaping.Rd create mode 100644 man/check_data_lifetable.Rd delete mode 100644 man/check_smooth.Rd create mode 100644 man/graduate_time.Rd delete mode 100644 man/smooth1d_chunk.Rd diff --git a/DESCRIPTION b/DESCRIPTION index ca1c74a..772e14b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,6 +23,10 @@ Suggests: Imports: DemoTools (>= 01.13.85), stringr, + codetools, + wpp2024, + forcats, + ungroup, dplyr, scales, readxl, diff --git a/NAMESPACE b/NAMESPACE index 306e786..5185fe7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,9 @@ export(capture_args) export(check_coherent) export(check_data) export(check_data_generic) +export(check_data_graduate_time) +export(check_data_heaping) +export(check_data_lifetable) export(check_data_opag) export(check_groupid) export(check_has_numeric) @@ -23,11 +26,11 @@ export(check_rows) export(check_sequential) export(check_sex) export(check_sex_opag) -export(check_smooth) export(create_groupid) export(extension_check) export(graduate_auto) export(graduate_auto_5) +export(graduate_time) export(interp_linear) export(interpolate) export(lt_check) @@ -47,7 +50,6 @@ export(plot_smooth_compare) export(pyramid) export(read_data) export(smooth1d) -export(smooth1d_chunk) export(smooth_flexible) export(smooth_flexible_chunk) importFrom(DemoTools,OPAG) @@ -62,6 +64,7 @@ importFrom(DemoTools,graduate) importFrom(DemoTools,graduate_mono) importFrom(DemoTools,graduate_uniform) importFrom(DemoTools,groupAges) +importFrom(DemoTools,int2age) importFrom(DemoTools,is_abridged) importFrom(DemoTools,is_age_coherent) importFrom(DemoTools,is_age_redundant) @@ -88,6 +91,7 @@ importFrom(cowplot,plot_grid) importFrom(dplyr,.data) 
importFrom(dplyr,across) importFrom(dplyr,arrange) +importFrom(dplyr,as_tibble) importFrom(dplyr,bind_rows) importFrom(dplyr,case_match) importFrom(dplyr,case_when) @@ -116,6 +120,7 @@ importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,summarize) importFrom(dplyr,ungroup) +importFrom(forcats,as_factor) importFrom(furrr,future_map_dfr) importFrom(future,multisession) importFrom(future,plan) @@ -134,6 +139,7 @@ importFrom(ggplot2,ggplot) importFrom(ggplot2,ggtitle) importFrom(ggplot2,guide_legend) importFrom(ggplot2,labs) +importFrom(ggplot2,scale_color_brewer) importFrom(ggplot2,scale_color_discrete) importFrom(ggplot2,scale_color_manual) importFrom(ggplot2,scale_fill_brewer) @@ -149,6 +155,7 @@ importFrom(ggplot2,theme_minimal) importFrom(grDevices,gray) importFrom(magrittr,"%>%") importFrom(mgcv,gam) +importFrom(mgcv,s) importFrom(parallelly,availableCores) importFrom(purrr,map) importFrom(purrr,map2) @@ -168,10 +175,12 @@ importFrom(scales,comma) importFrom(scales,label_log) importFrom(scales,pretty_breaks) importFrom(signal,interp1) +importFrom(stats,gaussian) importFrom(stats,loess) importFrom(stats,loess.control) importFrom(stats,na.omit) importFrom(stats,predict) +importFrom(stats,qlogis) importFrom(stats,smooth.spline) importFrom(stats,splinefun) importFrom(stats,supsmu) @@ -180,6 +189,7 @@ importFrom(stringr,str_detect) importFrom(stringr,str_flatten) importFrom(tibble,as_tibble) importFrom(tibble,lst) +importFrom(tibble,rownames_to_column) importFrom(tibble,tibble) importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) @@ -189,6 +199,7 @@ importFrom(tidyselect,all_of) importFrom(tidyselect,any_of) importFrom(tidyselect,everything) importFrom(tidyselect,matches) +importFrom(ungroup,pclm) importFrom(utils,data) importFrom(utils,globalVariables) importFrom(utils,installed.packages) diff --git a/R/checkers.R b/R/checkers.R index 5305986..12076e1 100644 --- a/R/checkers.R +++ b/R/checkers.R @@ -131,8 +131,6 @@ check_numeric <- 
function(data) { #' data = data) #' -# Should we even ask some of these? Like if there is no AgeInt the reader will not work. - check_missing_cols <- function(data) { # TR: DemoTools has the function age2int() to infer age intervals from an Age vector, # to technically we don't need it. We do however need ages specified as lower bounds of @@ -884,3 +882,334 @@ check_data_opag <- function(data) { return(result) } + + +#' Checks input data for lifetable reconstruction +#' @title Check consistency of lifetable input data +#' @description +#' Validates a data frame intended for lifetable reconstruction. +#' The function checks whether the input contains at least one column that can be used to reconstruct a full lifetable and performs a set of structural and value-based validations. +#' If no grouping variable `.id` is present, one is automatically created with a single level `"all"`. +#' @details +#' The function searches for at least one of the following columns (in order of preference): +#' \itemize{ +#' \item \code{nMx} +#' \item \code{nlx} +#' \item \code{nqx} +#' \item \code{npx} +#' \item \code{ndx} +#' \item \code{nLx} +#' } +#' The first available column is used for validation checks. The following checks are performed: +#' \itemize{ +#' \item Numeric type validation +#' \item Missing value detection +#' \item Non-negativity constraint +#' } +#' If an \code{Age} column is present, additional checks are performed: +#' \itemize{ +#' \item Age coherence (numeric validity) +#' \item Redundancy detection +#' \item Sequential ordering +#' } +#' Validation is performed separately for each \code{.id} group. +#' @param data A data frame containing lifetable-related columns. Must include at least one of: \code{nMx}, \code{nlx}, \code{nqx}, \code{npx}, \code{ndx}, or \code{nLx}. Optionally may contain an \code{Age} column and a grouping column \code{.id}. 
+#' @importFrom DemoTools age2int is_age_coherent is_age_redundant is_age_sequential +#' @return +#' A data frame summarizing validation results for each group. The returned object contains: +#' \describe{ +#' \item{check}{Name of the validation performed} +#' \item{message}{Error message if the check fails, otherwise \code{NA}} +#' \item{pass}{Either `"Pass"` or `"Fail"`} +#' \item{.id}{Group identifier} +#' } +#' +#' @examples +#' library(readr) +#' fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +#' data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +#' data_in$nMx <- data_in$Deaths / data_in$Exposures +#' data_in <- data_in[data_in$`.id` == 1, ] +#' check_data_lifetable(data_in) +#' +#' @seealso +#' \code{\link{age2int}}, +#' \code{\link{is_age_coherent}}, +#' \code{\link{is_age_redundant}}, +#' \code{\link{is_age_sequential}} +#' +#' @export +#' + +check_data_lifetable <- function(data) { + + # Ensure '.id' column exists + if (!(".id" %in% names(data))) { + data$.id <- "all" + } + + # We can reconstruct lifetable from any of these columns + # We do not need ax strictly speaking, we cannot reconstruct from Tx and ex + # NOTE: I arranged columns in a order of preference from the best to the worst + # i.e. I prefer to construct LT from nMx, then from nlx etc. + # now I can choose the one column to check very easily + lt_cols <- c("nMx", "nlx", "nqx", "npx", "ndx", "nLx") + present <- intersect(lt_cols, names(data))[1] + # If at lease one column is present we can build a litable + # lets perform a series fo checks + if(is.na(present)) { + + stop( + "Your data does not contain columns that can be used to reconstruct a full lifetable.\n", + "It must contain at least one of the following: ", + paste(lt_cols, collapse = ", "), + call. 
= FALSE + ) + + } + + split_data <- split(data, data$.id) + + check_one <- function(d) { + + # choose column to work with + x <- d[[present]] + + # Perform checks + checks <- c( + check_numeric = !is.numeric(x), + check_missing = any(is.na(x)), + check_negative = any(x < 0, na.rm = TRUE) + ) + + messages <- c( + check_numeric = sprintf("Column %s must be numeric", present), + check_missing = sprintf("Column %s should not contain missing values", present), + check_negative = sprintf("All values in column %s should be >= 0", present) + ) + + out <- data.frame( + check = paste0(names(checks)," (", present, ")"), + message = ifelse(checks, messages, NA_character_)) + + out$pass <- ifelse(!is.na(out$message), "Fail", "Pass") + + if("Age" %in% names(d)) { + + Age <- d$Age + AgeInt <- age2int(Age) + + checks_age <- c( + check_coherent = !is_age_coherent(Age, AgeInt), + check_redundant = is_age_redundant(Age, AgeInt), + check_sequential = !is_age_sequential(Age) + ) + + messages_age <- c( + check_coherent = "Age values must be numeric", + check_redundant = "Age values are redundant", + check_sequential = "Age values should be sequential" + ) + + out_age <- data.frame( + check = names(checks_age), + message = ifelse(checks_age, messages_age, NA_character_)) + + out_age$pass <- ifelse(!is.na(out$message), "Fail", "Pass") + + out <- rbind(out, out_age) + + } + + return(out) + + } + + res <- + do.call(rbind, lapply(names(split_data), \(id) { + cbind(check_one(split_data[[id]]), .id = id) + })) + + return(res) + +} + + +#' Validate input data for age heaping analysis +#' @title Check consistency of data for age heaping diagnostics +#' @description +#' Validates a data frame prior to performing age heaping analysis. +#' The function checks whether a specified variable contains valid numeric values and ensures that required structural conditions are satisfied. +#' If a grouping variable \code{.id} is not present, one is automatically created with a single level `"all"`. 
+#' Validation is performed separately for each \code{.id} group. +#' @param data A data frame containing the variable to be evaluated. +#' Optionally may contain a grouping column \code{.id}. +#' @param X A character string specifying the name of the variable to be checked (e.g., `"Deaths"`). +#' @details +#' For the selected variable \code{X}, the following checks are performed: +#' \itemize{ +#' \item Numeric type validation +#' \item Missing value detection +#' \item Infinite value detection +#' \item Strict positivity constraint (all values must be > 0) +#' \item Presence of an \code{Age} column (required for age heaping analysis) +#' } +#' Each validation is conducted within levels of \code{.id}. +#' @return +#' A data frame summarizing validation results for each group. +#' The returned object contains: +#' \describe{ +#' \item{check}{Name of the validation performed} +#' \item{message}{Error message if the check fails, otherwise \code{NA}} +#' \item{pass}{Either `"Pass"` or `"Fail"`} +#' \item{.id}{Group identifier} +#' } +#' @examples +#' library(readr) +#' fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +#' data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +#' data_in <- data_in[data_in$`.id` == 1, ] +#' check_data_heaping(data = data_in, X = "Deaths") +#' +#' @export +#' +check_data_heaping <- function(data, X) { + + # Ensure '.id' column exists + if (!(".id" %in% names(data))) { + data$.id <- "all" + } + + # Perform checks + # Split data by '.id', apply checks, and combine results + split_data <- split(data, data$.id) + + check_one <- function(d) { + + x <- d[[X]] + checks <- c(check_numeric = !is.numeric(x), + check_missing = any(is.na(x)), + check_infinite = any(is.infinite(x)), + check_negative = any(x <= 0, na.rm = TRUE), + check_age = !("Age" %in% names(d)) + ) + + messages <- c( + check_numeric = sprintf("Column %s must be numeric", X), + check_missing = sprintf("Column %s should not contain 
missing values", X), + check_infinite = sprintf("Column %s should not contain infinite values", X), + check_nonpos = sprintf("All values in column %s must be > 0", X), + check_age = "Age column is required to check age heaping" + ) + + out <- data.frame( + check = names(checks), + message = ifelse(checks, messages, NA_character_)) + + out$pass <- ifelse(!is.na(out$message), "Fail", "Pass") + + return(out) + + } + + res <- + do.call(rbind, lapply(names(split_data), \(id) { + cbind(check_one(split_data[[id]]), .id = id) + })) + + return(res) + +} + +#' Validate input data for time graduation +#' @description +#' Performs structural and numerical validation checks for time-series data used in graduation or smoothing procedures. The function verifies that time points are numeric, unique, sequential, and evenly spaced, and that the associated values are numeric, finite, and strictly positive. +#' Validation is performed separately for each `.id` group. If `.id` is `NULL`, all observations are treated as belonging to a single group. +#' @param value Numeric vector of values to be graduated (e.g., life expectancy). +#' @param time Numeric vector of time points corresponding to `value` (e.g., calendar years). +#' @param .id Optional grouping variable. If provided, checks are performed separately by group. +#' @importFrom tibble tibble +#' @importFrom rlang %||% +#' @return +#' A data frame summarizing validation results. 
Contains: +#' \itemize{ +#' \item `check` – name of the validation performed +#' \item `message` – error message if the check fails, otherwise `NA` +#' \item `pass` – `"Pass"` or `"Fail"` +#' \item `.id` – group identifier +#' } +#' +#' @examples +#' check_data_graduate_time( +#' value = c(70.1, 70.4, 70.8), +#' time = c(2000, 2005, 2010) +#' ) +#' +#' @export +#' + +check_data_graduate_time <- function(value, time, .id = NULL) { + + data <- tibble( + value = value, + time = time, + .id = .id %||% "all" + ) + + split_data <- split(data, data$.id) + + check_one <- function(d) { + + value <- d$value + time <- d$time + + time_diff <- diff(sort(time)) + + checks <- c( + check_time_numeric = !is.numeric(time), + check_value_numeric = !is.numeric(value), + check_time_missing = any(is.na(time)), + check_value_missing = any(is.na(value)), + check_time_infinite = any(is.infinite(time)), + check_value_infinite = any(is.infinite(value)), + check_time_positive = any(time <= 0, na.rm = TRUE), + check_value_positive = any(value <= 0, na.rm = TRUE), + check_time_unique = length(time) != length(unique(time)), + check_time_sequential = !all(time == sort(time)), + check_time_regular = length(unique(time_diff)) > 1 + ) + + messages <- c( + check_time_numeric = "Time variable must be numeric", + check_value_numeric = "Value variable must be numeric", + check_time_missing = "Time variable contains missing values", + check_value_missing = "Value variable contains missing values", + check_time_infinite = "Time variable contains infinite values", + check_value_infinite = "Value variable contains infinite values", + check_time_positive = "Time values must be > 0", + check_value_positive = "Value values must be > 0", + check_time_unique = "Time values must be unique", + check_time_sequential = "Time values must be sequential and ordered", + check_time_regular = "Time intervals must be regular (equal spacing)" + ) + + out <- data.frame( + check = names(checks), + message = ifelse(checks, 
messages, NA_character_) + ) + + out$pass <- ifelse(!is.na(out$message), "Fail", "Pass") + + return(out) + } + + res <- do.call( + rbind, + lapply(names(split_data), function(id) { + cbind(check_one(split_data[[id]]), .id = id) + }) + ) + + return(res) +} + diff --git a/R/graduation.R b/R/graduation.R index bf09a76..bf44c9c 100644 --- a/R/graduation.R +++ b/R/graduation.R @@ -1,3 +1,323 @@ +#' Graduate grouped time data to single-year estimates +#' Expands grouped time data (e.g., 5-year intervals) into single-year values using various graduation methods. The function supports smoothing, monotone interpolation, uniform splitting, cubic regression splines, penalized composite link models (PCLM), and LOESS smoothing. +#' +#' Available methods: +#' \itemize{ +#' \item `"uniform"` — distributes totals evenly within each interval +#' \item `"mono"` — monotone cubic interpolation (Hyman filtered spline) +#' \item `"native"` — smoothing spline via \code{stats::smooth.spline} +#' \item `"cubic"` — cubic regression spline via \code{mgcv::gam} +#' \item `"loess"` — local polynomial smoothing via \code{stats::loess} +#' \item `"pclm"` — penalized composite link model via \code{ungroup::pclm} +#' } +#' +#' Block totals are preserved for all smoothing-based methods by rescaling +#' predictions within each interval to match original grouped totals. +#' @param data_in Data frame containing grouped time data. +#' @param method Graduation method. One of \code{"loess"}, \code{"native"}, \code{"cubic"}, \code{"mono"}, \code{"uniform"}, \code{"pclm"}. +#' @param X Name of the time variable (character string). +#' @param Y Name of the grouped value variable (character string). +#' @param timeInt Width of grouping interval (default = 5). 
+#' +#' @importFrom stats smooth.spline loess loess.control predict splinefun +#' @importFrom mgcv gam s +#' @importFrom ungroup pclm +#' @importFrom tibble rownames_to_column +#' @importFrom dplyr mutate +#' @importFrom magrittr %>% +#' @importFrom DemoTools int2age +#' @importFrom stats gaussian +#' @details +#' If column \code{.id} exists in \code{data_in}, graduation is performed separately within each subgroup. Otherwise all observations are treated as a single group. +#' For spline-based methods, predicted single-year values are rescaled +#' within each interval so that grouped totals are exactly preserved. +#' The PCLM method relies on the \pkg{ungroup} package and may internally rescale small counts to avoid negative fitted values. +#' +#' @return +#' A data frame with columns: +#' \itemize{ +#' \item \code{time} — single-year time values +#' \item \code{value} — graduated single-year estimates +#' \item \code{.id} — subgroup identifier +#' } +#' +#' @examples +#' df <- data.frame( +#' Year = seq(1950, 2015, by = 5), +#' Deaths = c(403564.9, 426012.0, 466215.9, 523753.8, 560874.1, +#' 545608.3, 555335.9, 594584.6, 608425.3, 638167.3, +#' 655438.6, 689386.4, 740519.0, 804439.3) +#' ) +#' +#' graduate_time( +#' df, +#' method = "pclm", +#' X = "Year", +#' Y = "Deaths", +#' timeInt = 5 +#' ) +#' +#' @export +#' + +graduate_time <- function(data_in, + method = c("loess", "native", "cubic", "mono", "uniform", "pclm"), + X, + Y, + timeInt = 5) { + # uniform graduation + graduate_uniform_time <- function(Value, + time, + timeInt = 5) { + + if (missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + if (is_single(time)) { + names(Value) <- time + return(Value) + } + + out <- rep(Value / timeInt, each = timeInt) + names(out) <- min(time):(min(time) + length(out) - 1) + + return(out) + + } + + # mono graduation + graduate_mono_time 
<- function (Value, time, timeInt = 5) { + + timeInt <- rep(timeInt, times = length(time)) + + if (missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + if (is_single(time)) { + names(Value) <- time + return(Value) + } + + timePred <- c(min(time), cumsum(timeInt) + min(time)) + y <- c(0, cumsum(Value)) + timeS <- min(time):(sum(timeInt) + min(time)) + y1 <- splinefun(y ~ timePred, method = "hyman")(timeS) + out <- diff(y1) + names(out) <- timeS[-length(timeS)] + time1 <- min(time):(min(time) + length(out) - 1) + names(out) <- time1 + return(out) + } + + # pclm routine using ungroup package + graduate_pclm_time <- function(Value, time, timeInt = 5) { + + + if (missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + + timeInt <- rep(timeInt, times = length(time)) + a1 <- seq(from = min(time), to = c(max(time) + (unique(timeInt) - 1)), by = 1) + ind0 <- Value == 0 + have0s <- any(ind0) + + if (have0s) { + cat("\n0s detected in Value, replacing with .01\n") + Value[ind0] <- 0.01 + } + + A <- pclm(x = time, + y = Value, + nlast = unique(timeInt)) + fac <- 1 + for(i in 1:3) { + + if(any(A$fitted < 0)) { + + fac <- 10 ^ i + A <- pclm(x = time, + y = Value * fac, + nlast = unique(timeInt)) + + } else { + + break + + } + } + + if(any(A$fitted < 0)) { + cat("\nCareful, results of PCLM produced some negatives. 
\n \nWe tried rescaling inputs by as much as", + fac, "\nbut alas it wasn't enough.\n") + } + if(fac > 1) { + cat("\nPossible small counts issue with these data and the PCLM method\nIt seems to have worked without producing negatives when fitting Value is scaled by", + fac, "\nCouldn't hurt to eyeball results!\n") + } + + B <- A$fitted / fac + a1.fitted <- A$bin.definition$output$breaks["left", ] + names(B) <- a1.fitted + return(B) + + } + + # cubic spline + graduate_cubic_time <- function(Value, time, timeInt = 5) { + + if (missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + + a1 <- seq(from = min(time), to = c(max(time) + (unique(timeInt) - 1)), by = 1) + + # Smooth totals (shape only) + z2 <- gam( + Value ~ s(time, bs = "cr"), + family = gaussian(), + method = "REML" + ) + + pred <- predict( + z2, + newdata = data.frame(time = a1) + ) + + # Enforce totals blockwise + out <- pred + for(i in seq_along(Value)) { + idx <- ((i - 1) * unique(timeInt) + 1):(i * unique(timeInt)) + out[idx] <- out[idx] * Value[i] / sum(out[idx]) + } + + return(out) + } + + # native fraduation menthod (spline) + graduate_native_time <- function(Value, time, timeInt = 5) { + + if (missing(time) & missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + + a1 <- seq(from = min(time), to = c(max(time) + (unique(timeInt) - 1)), by = 1) + z <- smooth.spline(time, Value, spar = 0.6) + + pred <- predict(z, a1)$y + + # Enforce totals blockwise + out <- pred + for (i in seq_along(Value)) { + idx <- ((i - 1) * unique(timeInt) + 1):(i * unique(timeInt)) + out[idx] <- out[idx] * Value[i] / sum(out[idx]) + } + + return(out) + } + + + # loess graduation + graduate_loess_time <- function(Value, time, timeInt = 5) { + + if (missing(time) & 
missing(timeInt)) { + time <- names2age(Value) + } + if (missing(timeInt)) { + timeInt <- age2int(time, OAG = FALSE) + } + if (missing(time)) { + time <- int2age(timeInt) + } + + a1 <- seq(from = min(time), to = c(max(time) + (unique(timeInt) - 1)), by = 1) + z1 <- loess(Value ~ time, + control = loess.control(surface = "direct")) + + pred <- predict(z1, newdata = data.frame(time = a1)) + + # rescale each 5-year block + out <- pred + for (i in seq_along(Value)) { + idx <- ((i - 1) * unique(timeInt) + 1):(i * unique(timeInt)) + out[idx] <- out[idx] * Value[i] / sum(out[idx]) + } + + return(out) + } + + method <- match.arg(method) + + # Ensure '.id' column exists + if (!(".id" %in% names(data_in))) { + data_in$.id <- "all" + } + + split_data <- split(data_in, data_in$.id) + + for_one <- function(d) { + + time <- d[[X]] + Value <- d[[Y]] + + res <- switch(method, + "native" = graduate_native_time( Value, time, timeInt = timeInt), + "loess" = graduate_loess_time( Value, time, timeInt = timeInt), + "cubic" = graduate_cubic_time( Value, time, timeInt = timeInt), + "mono" = graduate_mono_time( Value, time, timeInt = timeInt), + "uniform" = graduate_uniform_time(Value, time, timeInt = timeInt), + "pclm" = graduate_pclm_time( Value, time, timeInt = timeInt) + ) + + + res <- as.data.frame(res) %>% + rownames_to_column() %>% + set_names(c("time", "value")) %>% + mutate("value" = as.numeric(.data$value), + "time" = as.numeric(.data$time)) + + return(res) + + } + + out <- do.call(rbind, lapply(names(split_data), \(id) { + cbind(for_one(split_data[[id]]), .id = id) + })) + + return(out) + +} + #' @title `smooth_flexible` #' @description This is a wrapper around `smooth_flexible_chunk` that allows us to work with `.id` variable and create the output for corresponding groups. Smoothes population or death counts using various methods from the `DemoTools` package and paragraph five from "Method protocol for evaluating census population data by age and sex". 
#' @param data_in tibble or data.frame. A tibble with two numeric columns - population or death counts with supported names: `Pop`, `Population`, `Exp`, `Exposures` or `Deaths`, and corresponding numeric `Age` - provided in single age intervals, 5-year age intervals, or abridged age format e.g. with ages 0, 1, 5, etc. @@ -26,7 +346,7 @@ #' "single_hmd_spain.csv.gz", #' package = "ODAPbackend") #' data_in <- read_csv(fpath) |> -#' dplyr::select(-1) +#' select(-1) #' ex1 <- smooth_flexible( #' data_in, #' variable = "Exposures", @@ -114,13 +434,13 @@ smooth_flexible <- function(data_in, # nms <- results |> - dplyr::select(all_of(c(".id", by_args))) |> + select(all_of(c(".id", by_args))) |> unite("one", everything(), sep = "_") |> pull(.data$one) smoothed_data <- results |> mutate(data = map(.data$result, ~ .x[["data"]]))|> - dplyr::select(-c(.data$result)) |> + select(-c(.data$result)) |> unnest(.data$data) figures <- results$result |> @@ -157,7 +477,7 @@ smooth_flexible <- function(data_in, #' "five_hmd_spain.csv.gz", #' package = "ODAPbackend") #' data_in <- read_csv(fpath) |> -#' dplyr::select(-1) |> +#' select(-1) |> #' filter(.id == 1) #' #' ex1 <- graduate_auto(data_in, @@ -171,6 +491,8 @@ smooth_flexible <- function(data_in, #' ex1$arguments #' +# only can be called for counts and ages +# no need for rates and TIME graduate_auto <- function(data_in, age_out = c("single", "abridged", "5-year"), variable = NULL, @@ -543,8 +865,8 @@ graduate_auto <- function(data_in, # Check if we're converting from single ages to grouped ages # If so, normalize graduated values for fair visual comparison - input_is_single <- DemoTools::is_single(data_in$Age) - output_is_grouped <- !DemoTools::is_single(data_out$Age) && + input_is_single <- is_single(data_in$Age) + output_is_grouped <- !is_single(data_out$Age) && (age_out %in% c("5-year", "abridged")) if (input_is_single && output_is_grouped) { @@ -571,19 +893,19 @@ graduate_auto <- function(data_in, names(color_values) <- 
c(original_text, graduated_text) # Build plot using .data[[]] for runtime evaluation - plot_obj <- ggplot2::ggplot(plot_data, ggplot2::aes(x = .data[[age_text]], y = .data[[variable_text]], + plot_obj <- ggplot(plot_data, aes(x = .data[[age_text]], y = .data[[variable_text]], color = .data[[type_text]], linetype = .data[[type_text]])) + - ggplot2::geom_line(linewidth = 1) + - ggplot2::geom_point(data = plot_data[plot_data[[type_text]] == original_text, ], size = 2) + - ggplot2::scale_color_manual(name = type_text, values = color_values) + - ggplot2::scale_linetype_manual(name = type_text, values = c("solid", "solid")) + - ggplot2::labs(x = age_text, y = variable_text, + geom_line(linewidth = 1) + + geom_point(data = plot_data[plot_data[[type_text]] == original_text, ], size = 2) + + scale_color_manual(name = type_text, values = color_values) + + scale_linetype_manual(name = type_text, values = c("solid", "solid")) + + labs(x = age_text, y = variable_text, title = paste0(graduation_text, ": ", variable_text)) + - ggplot2::theme_minimal(base_size = 14) + - ggplot2::theme( + theme_minimal(base_size = 14) + + theme( legend.position = "bottom", - plot.title = ggplot2::element_text(face = "bold", hjust = 0.5) + plot.title = element_text(face = "bold", hjust = 0.5) ) cat(sprintf("[GRADUATION] Plot created successfully\n")) @@ -596,8 +918,8 @@ graduate_auto <- function(data_in, data_out_normalized <- data_out # Check if we're converting from single ages to grouped ages - input_is_single <- DemoTools::is_single(data_in$Age) - output_is_grouped <- !DemoTools::is_single(data_out$Age) && + input_is_single <- is_single(data_in$Age) + output_is_grouped <- !is_single(data_out$Age) && (age_out %in% c("5-year", "abridged")) if (input_is_single && output_is_grouped) { @@ -636,7 +958,7 @@ graduate_auto <- function(data_in, #' "five_hmd_spain.csv.gz", #' package = "ODAPbackend") #' data_in <- read_csv(fpath) |> -#' dplyr::select(-1) |> +#' select(-1) |> #' filter(.id == 1) #' #' 
graduate_auto_5(data_in, "Exposures")$Exposures @@ -774,7 +1096,7 @@ graduate_auto_5 <- function(dat_5, #' "abridged_hmd_spain.csv.gz", #' package = "ODAPbackend") #' data_in <- read_csv(fpath) |> -#' dplyr::select(-1) |> +#' select(-1) |> #' filter(.id == 1) #' #' ex1 <- smooth_flexible_chunk( @@ -791,6 +1113,17 @@ graduate_auto_5 <- function(dat_5, #' ex1$figure$figure #' ex1$arguments + +# if the input is time then user provides vector 2011:2020 +# another copy for graduate_time +# graduate time with pclm and mono, graduate uniform +# "pclm", "mono", "uniform" +# do not demand age, time can be various intervals +# times should be unique, and sequential and make seance, and coherent 2005:2010, 2008:2011 +# no overlap (unique) +# use the logic of age coherent +# use teh same plotting logic + smooth_flexible_chunk <- function(data_in, variable = "Deaths", age_out = c("single", "abridged", "5-year"), @@ -1348,8 +1681,8 @@ smooth_flexible_chunk <- function(data_in, #' "abridged_hmd_spain.csv.gz", #' package = "ODAPbackend") #' data_in <- read_csv(fpath, show_col_types = FALSE) |> -#' dplyr::select(-1) |> -#' dplyr::filter(.id == 1) +#' select(-1) |> +#' filter(.id == 1) #' #' data_out <- smooth_flexible_chunk( #' data_in, diff --git a/R/input_diagnostics.R b/R/input_diagnostics.R index 7cc4a0b..e229c1c 100644 --- a/R/input_diagnostics.R +++ b/R/input_diagnostics.R @@ -1,24 +1,3 @@ -#' @title `check_heaping_general` -#' @description Check the age heaping for 5 or 1 year data. -#' @param data data.frame. User file from the read_data command with the minimum data on `Exposures`, `Death` and `Age`. Data can be both in 5 and 1 year age intervals -#' @param y character.Variable name for which the heaping should be checked `Deaths` or `Exposures`. 
-#' @return A data.frame with 3 columns `method` - the method used for age heaping evaluation, `result` - the resulting heaping measure, and `.id` - user specified groups identifier (if missing is set to "all") -#' @importFrom tibble tibble -#' @importFrom tidyr unnest pivot_longer -#' @importFrom tidyselect all_of -#' @importFrom stringr str_detect -#' @importFrom dplyr bind_rows left_join select group_nest mutate pull group_by -#' @importFrom rlang .data -#' @importFrom purrr map map2 -#' @importFrom DemoTools is_single check_heaping_roughness check_heaping_bachi check_heaping_myers check_heaping_sawtooth -#' @export -#' @examples -#' \dontrun{ -#' check_heaping_general( -#' data = data, -#' y = "Exposures") -#' } -#' # check_heaping_general <- function(data, y) { # @@ -81,115 +60,278 @@ # return(out) # # } -check_heaping_general <- function(data, y) { - - if(!(".id" %in% colnames(data))) { - data <- data |> - mutate(.id = "all") - } - - labels <- c( - "Highly accurate", - "Fairly accurate", - "Approximate", - "Rough", - "Very rough" - ) + +# add argument rate or proportion, units +# and add log, sqrt, logit +# Scale +# check_heaping_general <- function(data, y) { +# +# if(!(".id" %in% colnames(data))) { +# data <- data |> +# mutate(.id = "all") +# } +# +# labels <- c( +# "Highly accurate", +# "Fairly accurate", +# "Approximate", +# "Rough", +# "Very rough" +# ) +# +# tbl <- tibble("level" = labels, +# color = c("#ffffb2", +# "#fecc5c", +# "#fd8d3c", +# "#f03b20", +# "#bd0026")) +# +# has_single <- is_single(unique(data$Age)) +# +# if(has_single) { +# +# res <- data |> +# select("Age", all_of(y), ".id") |> +# group_nest(.data$`.id`) |> +# mutate("bachi" = map(data, ~ +# check_heaping_bachi( +# Value = pull(.x, !!sym(y)), +# Age = .x$Age, +# OAG = TRUE, +# method = "pasex")), +# "myers" = map(data, ~ +# check_heaping_myers( +# Value = pull(.x, !!sym(y)), +# Age = .x$Age)), +# "age scale" = "single") |> +# select(".id", "bachi", "myers", "age scale") |> +# 
unnest(c("bachi", "myers")) |> +# pivot_longer(-c(".id", "age scale"), +# names_to = "method", +# values_to = "result") |> +# group_by(.data$`.id`) |> +# mutate("level" = cut(.data$result, +# breaks = c(0, 1, 2, 5, 15, 80), +# labels = labels), +# "level" = as.character(.data$level)) |> +# left_join(tbl, by = c("level")) +# +# } +# +# # 5-year methods +# classify <- function(x, method, breaks, labels) { +# tibble( +# result = x, +# method = method, +# level = cut(x, breaks = breaks, labels = labels) +# ) +# } +# +# res5 <- data |> +# select("Age", all_of(y), ".id") |> +# group_nest(.data$`.id`) |> +# mutate( +# "roughness" = map(data, ~ +# check_heaping_roughness( +# Value = pull(.x, !!sym(y)), +# Age = .x$Age, +# ageMin = 30)), +# "sawtooth" = map(data, ~ +# check_heaping_sawtooth( +# Value = pull(.x, !!sym(y)), +# Age = .x$Age, +# ageMin = 30)), +# "res" = map2(.x = .data$roughness, +# .y = .data$sawtooth, ~ +# bind_rows( +# classify( +# x = .x, +# method = "roughness", +# breaks = c(0, .1, .2, .5, 1.5, 10), +# labels = labels), +# classify( +# x = .y, +# method = "sawtooth", +# breaks = c(1, 1.03, 1.1, 1.5, 3, 10), +# labels = labels))), +# "age scale" = "5-year") |> +# select(-c("data", "roughness", "sawtooth")) |> +# unnest("res") |> +# left_join(tbl, by = join_by("level")) +# +# if(has_single) { +# +# out <- bind_rows(res, res5) +# +# } else { +# +# out <- res5 +# +# } +# +# return(out) +# +# } + + +#' General age-heaping diagnostics +#' @description +#' Computes age-heaping and age-irregularity indices from age-specific counts, rates, or proportions. The set of diagnostics depends on the age scale: digit-preference indices are used for single-year ages, while irregularity indices are used for grouped ages. +#' @param data A data frame containing at least `Age` and the variable `y`. If `.id` is present, diagnostics are computed by group; otherwise all observations are treated as one group. 
+#' @param y Character string giving the name of the age-specific variable. +#' @param units Type of input data. One of `"count"`, `"rate"`, or `"proportion"`. +#' @importFrom tibble tibble +#' @importFrom dplyr full_join mutate left_join join_by bind_rows +#' @importFrom DemoTools check_heaping_bachi check_heaping_myers check_heaping_roughness check_heaping_sawtooth +#' @details +#' * Single-year ages: Bachi and Myers digit-preference indices. +#' * Grouped ages: roughness and sawtooth irregularity indices. +#' For `"count"` data, values are internally converted to proportions. +#' For `"rate"` and `"proportion"` data, values are normalized to sum to one +#' to ensure comparability across ages. +#' @return +#' A data frame with one row per index and group, containing: +#' \itemize{ +#' \item `method` – diagnostic index name +#' \item `result` – numeric value +#' \item `level` – qualitative data-quality category +#' \item `color` – associated color code +#' \item `age scale` – `"single"` or `"5-year"` +#' \item `.id` – group identifier +#' } +#' @seealso +#' \code{\link{check_heaping_bachi}}, +#' \code{\link{check_heaping_myers}}, +#' \code{\link{check_heaping_roughness}}, +#' \code{\link{check_heaping_sawtooth}} +#' @references +#' Bachi (1951); Myers (1940); United Nations (1955). 
+#' @examples +#' library(readr) +#' fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +#' data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +#' data_in$nMx <- data_in$Deaths / data_in$Exposures +#' data_in <- data_in[data_in$.id == 1, ] +#' check_heaping_general(data = data_in, y = "Deaths", units = "count") +#' check_heaping_general(data = data_in, y = "nMx", units = "rate") +#' @export +#' + +check_heaping_general <- function(data, y, units = c("count", "rate", "proportion")) { - tbl <- tibble("level" = labels, - color = c("#ffffb2", - "#fecc5c", - "#fd8d3c", - "#f03b20", - "#bd0026")) + units <- match.arg(units) - has_single <- is_single(unique(data$Age)) + stopifnot(is.character(y), length(y) == 1) + stopifnot(all(c("Age", y) %in% names(data))) - if(has_single) { - - res <- data |> - select("Age", all_of(y), ".id") |> - group_nest(.data$`.id`) |> - mutate("bachi" = map(data, ~ - check_heaping_bachi( - Value = pull(.x, !!sym(y)), - Age = .x$Age, - OAG = TRUE, - method = "pasex")), - "myers" = map(data, ~ - check_heaping_myers( - Value = pull(.x, !!sym(y)), - Age = .x$Age)), - "age scale" = "single") |> - select(".id", "bachi", "myers", "age scale") |> - unnest(c("bachi", "myers")) |> - pivot_longer(-c(".id", "age scale"), - names_to = "method", - values_to = "result") |> - group_by(.data$`.id`) |> - mutate("level" = cut(.data$result, - breaks = c(0, 1, 2, 5, 15, 80), - labels = labels), - "level" = as.character(.data$level)) |> - left_join(tbl, by = c("level")) - + # Ensure '.id' column exists + if (!(".id" %in% names(data))) { + data$.id <- "all" } - # 5-year methods - classify <- function(x, method, breaks, labels) { - tibble( - result = x, - method = method, - level = cut(x, breaks = breaks, labels = labels) - ) - } + # keep only columns we need + data <- data |> + select(c('.id', "Age", all_of(y))) - res5 <- data |> - select("Age", all_of(y), ".id") |> - group_nest(.data$`.id`) |> - mutate( - 
"roughness" = map(data, ~ - check_heaping_roughness( - Value = pull(.x, !!sym(y)), - Age = .x$Age, - ageMin = 30)), - "sawtooth" = map(data, ~ - check_heaping_sawtooth( - Value = pull(.x, !!sym(y)), - Age = .x$Age, - ageMin = 30)), - "res" = map2(.x = .data$roughness, - .y = .data$sawtooth, ~ - bind_rows( - classify( - x = .x, - method = "roughness", - breaks = c(0, .1, .2, .5, 1.5, 10), - labels = labels), - classify( - x = .y, - method = "sawtooth", - breaks = c(1, 1.03, 1.1, 1.5, 3, 10), - labels = labels))), - "age scale" = "5-year") |> - select(-c("data", "roughness", "sawtooth")) |> - unnest("res") |> - left_join(tbl, by = join_by("level")) + # Perform checks + # Split data by '.id', apply checks, and combine results + split_data <- split(data, data$.id) + labels <- c("Highly accurate", "Fairly accurate", "Approximate", "Rough", "Very rough") + color <- c("#ffffb2", "#fecc5c", "#fd8d3c", "#f03b20", "#bd0026") + tbl <- tibble("level" = labels, + "color" = color) - if(has_single) { + check_one <- function(d = split_data) { - out <- bind_rows(res, res5) + Y <- d[[y]] + Age <- d$Age + has_single <- is_single(Age) - } else { + if (units != "count") { + + Y <- Y / sum(Y) + + } - out <- res5 + roughness <- check_heaping_roughness( + Value = Y, + Age = Age, + ageMin = 30) + + sawtooth <- check_heaping_sawtooth( + Value = Y, + Age = Age, + ageMin = 30) + + roughness <- tibble("method" = "roughness", + "result" = roughness) |> + mutate("level" = cut(.data$result, + breaks = c(0, .1, .2, .5, 1.5, 10), + labels = labels), + "level" = as.character(.data$level)) + + sawtooth <- tibble("method" = "sawtooth", + "result" = sawtooth) |> + mutate("level" = cut(.data$result, + breaks = c(1, 1.03, 1.1, 1.5, 3, 10), + labels = labels), + "level" = as.character(.data$level)) + + out5 <- roughness |> + full_join(sawtooth, by = join_by("method", "result", "level")) |> + mutate("age scale" = "5-year") |> + left_join(tbl, by = c("level")) + + if(has_single) { + + bachi <- 
check_heaping_bachi( + Value = Y, + Age = Age, + OAG = TRUE, + method = "pasex") + + myers <- check_heaping_myers( + Value = Y, + Age = Age) + + out <- tibble("method" = c("bachi", "myers"), + "result" = c(bachi, myers), + "age scale" = "single") |> + mutate("level" = cut(.data$result, + breaks = c(0, 1, 2, 5, 15, 80), + labels = labels), + "level" = as.character(.data$level)) |> + left_join(tbl, by = c("level")) + + out <- bind_rows(out, out5) + + } else { + + out <- out5 + + } + + if(units %in% c("rate", "proportion")) { + + out <- out[!out$method == "roughness", ] + + } + + return(out) } - return(out) + res <- + do.call(rbind, lapply(names(split_data), \(id) { + cbind(check_one(split_data[[id]]), .id = id) + })) + + return(res) } + + #' @title `check_heaping_user` #' @description Check the age heaping for 5 or 1 year data, but this time give user control over minimum and maximum evaluation age. #' @param data data.frame. User file from the read_data command with the minimum data on Exposures, Death and Age. Data ca be both in 5 and 1 year age intervals diff --git a/R/odap.R b/R/odap.R index 26fd8ea..a1561ed 100644 --- a/R/odap.R +++ b/R/odap.R @@ -26,7 +26,6 @@ #' @importFrom rlang .data %||% sym #' @importFrom magrittr %>% #' @importFrom utils installed.packages data globalVariables -#' @export #' #' @details #' The function: @@ -47,10 +46,7 @@ #' \item{\code{figures}}{A named list of \code{ggplot2} objects showing original vs. 
redistributed populations.} #' } #' -#' @seealso \code{\link[DemoTools]{OPAG}}, \code{\link{link[DemoTools]{lt_single_mx}}, \code{\link{\link[DemoTools]{groupAges}} -#' #' @examples -#' \dontrun{ #' library(dplyr) #' #' data_in <- tibble( @@ -69,7 +65,9 @@ #' #' # Plot original vs redistributed population #' print(res$figures$India) -#' } +#' @export +#' + odap_opag <- function(data_in = NULL, Age_fit = c(60, 70), AgeInt_fit = c(10, 10), @@ -92,8 +90,8 @@ odap_opag <- function(data_in = NULL, method <- match.arg(method, c("mono", "pclm", "uniform")) - # Helper: conditional filtering for user defined variables - # e.g. if sex exits and provided by user we use it in filtering + # conditional filtering for user defined variables + # eg if sex exits and provided by user we use it in filtering conditional_filter <- function(df, col, values) { if(!is.null(values) && col %in% names(df)) { @@ -207,7 +205,7 @@ odap_opag <- function(data_in = NULL, nLx <- nLx %>% group_by(across(any_of(c("name", "country_code", "sex", "year")))) %>% reframe( - lt_single_mx(nMx = .data$mx, + lt_single_mx(nMx = .data$mx, Age = .data$age), .groups = "drop") %>% select("name", "country_code", diff --git a/R/smoothing1d.R b/R/smoothing1d.R index 7b317a6..d89a0c4 100644 --- a/R/smoothing1d.R +++ b/R/smoothing1d.R @@ -1,315 +1,283 @@ -#' @title `smooth1d_chunk` -#' @description Smooth a univariate time series, optionally using `weights.` Choose between the super-smoother (`"supsmu"`) method, loess (`"lowess"` or `"loess"`) , smoothing splines (`"cubicsplines"`), thin-plate splines (`"gam-tp"`), or p-splines (`"gam-ps"`). Input data may have multiple observations per x coordinate. Output can be for arbitrary x-coordinates (`xout`). -#' @details `"supsmu"` method takes a `smoothing_par` between 0 and 10. `"lowess"` and -#' @param data_in. `data.frame` with x and y coordinates to fit to. Optionally with `weights`. Should refer to a single subset of data. -#' @param method character. 
Smoothing method desired. options `"supsmu"` (default),`"lowess"`,`"loess"`,`"cubicspline"`,`"gam-tp"`,`"gam-ps"` -#' @param smoothing_par smoothing parameter, interpretation varies by method, but higher always means smoother.Default 1 -#' @param xout numeric vector of coordinates to predict for. Defaults to original unique coordinates. -#' @param xname name of variable containing x coordinate, default `x` -#' @param yname name of variable containing y coordinate, default `y` -#' @importFrom stats supsmu smooth.spline loess predict loess.control -#' @importFrom dplyr case_when +#' One–Dimensional Smoothing Wrapper for Age or Time Series +#' \code{smooth1d()} provides a unified interface for smoothing one–dimensional demographic or time–series data (e.g., age schedules or time trends). +#' The function supports multiple smoothing engines including \code{supsmu}, \code{lowess}, \code{loess}, cubic splines, and GAM-based methods. +#' Optional transformations (log, sqrt, logit) can be applied prior to smoothing. +#' The function automatically handles grouped data via a \code{.id} column. +#' @param data_in A data.frame or tibble containing the input data. +#' @param units Character string. One of: +#' \itemize{ +#' \item \code{"count"} +#' \item \code{"rate"} +#' \item \code{"proportion"} +#' } +#' @param X Character string. Name of the independent variable (typically \code{"Age"} or \code{"Time"}). +#' @param Y Character string. Name of the dependent variable. +#' @param scale Optional transformation applied before smoothing: +#' \itemize{ +#' \item \code{"log"} +#' \item \code{"sqrt"} +#' \item \code{"logit"} +#' \item \code{"none"} +#' } +#' @param method Smoothing engine: +#' \itemize{ +#' \item \code{"supsmu"} +#' \item \code{"lowess"} +#' \item \code{"loess"} +#' \item \code{"cubicspline"} +#' \item \code{"gam-tp"} (thin plate regression spline) +#' \item \code{"gam-ps"} (P-spline) +#' } +#' @param smoothing_par Numeric smoothing parameter. 
Interpretation depends on method. +#' @param xout Optional vector of evaluation points. Defaults to unique values of \code{X}. +#' @importFrom dplyr mutate group_split bind_rows case_when +#' @importFrom dplyr as_tibble +#' @importFrom purrr map +#' @importFrom rlang sym +#' @importFrom tibble tibble +#' @importFrom ggplot2 ggplot geom_point geom_line theme theme_bw +#' @importFrom ggplot2 element_text scale_x_continuous scale_y_continuous +#' @importFrom ggplot2 scale_color_brewer labs +#' @importFrom scales pretty_breaks +#' @importFrom stats supsmu loess smooth.spline predict qlogis +#' @importFrom mgcv gam s #' @importFrom signal interp1 -#' @importFrom mgcv gam +#' @importFrom magrittr %>% +#' @importFrom forcats as_factor +#' @return A named list with: +#' \itemize{ +#' \item \code{result} — Tibble containing smoothed values +#' \item \code{plot} — ggplot object comparing raw and smoothed values +#' } +#' +#' @details +#' If the input data does not contain a \code{.id} column, a default group \code{"all"} is created. +#' Weights default to 1 if not supplied. +#' Transformations are applied before smoothing but not automatically back-transformed. 
+#' +#' @examples +#' # Example 1: Age smoothing of mortality rates (log scale) +#' library(readr) +#' +#' fpath <- system.file( +#' "extdata", +#' "single_hmd_spain.csv.gz", +#' package = "ODAPbackend" +#' ) +#' +#' data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +#' data_in$nMx <- data_in$Deaths / data_in$Exposures +#' data_in <- data_in[data_in$.id == 1, ] +#' names(data_in) <- c(".id", "Time", "Age", "Sex", +#' "Deaths", "Exposures", "nMx") +#' +#' z <- smooth1d( +#' data_in = data_in, +#' units = "rate", +#' X = "Age", +#' Y = "nMx", +#' method = "supsmu", +#' scale = "log" +#' ) +#' +#' z$result +#' z$plot +#' +#' # Example 2: Time smoothing of counts +#' df <- data.frame( +#' Time = seq(1950, 2015, by = 5), +#' Deaths = c(403564.9, 426012.0, 466215.9, 523753.8, +#' 560874.1, 545608.3, 555335.9, 594584.6, +#' 608425.3, 638167.3, 655438.6, 689386.4, +#' 740519.0, 804439.3) +#' ) +#' +#' z <- smooth1d( +#' data_in = df, +#' units = "count", +#' X = "Time", +#' Y = "Deaths", +#' method = "supsmu", +#' scale = "none" +#' ) +#' +#' z$result +#' z$plot +#' #' @export #' -smooth1d_chunk <- function(data_in., - method = c("supsmu", "lowess", "loess", - "cubicspline", "gam-tp", "gam-ps")[1], +smooth1d <- function(data_in, + units = c("count", "rate", "proportion"), + X = c("Age", "Time"), + Y = c("Exposures"), + scale = c("log", "sqrt", "logit", "none"), + method = c("supsmu", "lowess", "loess", "cubicspline", "gam-tp", "gam-ps"), smoothing_par = 1, - xout = data_in.[["x"]], - xname = "x", - yname = "y"){ + xout = unique(data_in[[X]])) { - x <- data_in.[[xname]] - y <- data_in.[[yname]] - w <- data_in.[["weights"]] + units <- match.arg(units) + scale <- match.arg(scale) + X <- match.arg(X) - if (method == "supsmu"){ - smoothing_par <- case_when(smoothing_par < 0 ~ 0, - smoothing_par > 10 ~ 10, - TRUE ~ smoothing_par) - - fit <- supsmu(x = x, - y = y, - wt = w, - span = "cv", - bass = smoothing_par) - # 
---------------------------------------------------------------# - # TR: would be bettwe to follow this function with interpolate() # - # instead of fixing this option? # - # Then again maybe keep it cuz most methods have predict()? # - # ---------------------------------------------------------------# - pred <- interp1(fit$x, - fit$y, - xi = xout, - method = 'pchip', - extrap = TRUE) - data_out <- tibble(!!xname := xout, !!yname := pred) - } - if (method == "lowess"){ - fit <- loess(y ~ x, - weights = w, - span = smoothing_par, - degree = 1, - control = loess.control(surface = "direct")) - pred <- predict(fit, newdata = data.frame(x = xout)) - data_out <- tibble(!!xname := xout, !!yname := pred) + # Ensure '.id' column exists + if (!(".id" %in% names(data_in))) { + data_in$.id <- "all" } - if (method == "loess"){ - fit <- loess(y ~ x, - weights = w, - span = smoothing_par, - degree = 2, - control = loess.control(surface = "direct")) - pred <- predict(fit, newdata = data.frame(x = xout)) - data_out <- tibble(!!xname := xout, !!yname := pred) + if (!"weights" %in% colnames(data_in)){ + data_in <- data_in |> + mutate(weights = 1) } - if (method == "cubicspline") { - if (is.numeric(smoothing_par)){ - fit <- smooth.spline(x = x, - y = y, - w = w, - spar = smoothing_par) - } else { - fit <- smooth.spline(x = x, - y = y, - w = w, - cv = FALSE) - } - - pred <- predict(fit, x = data.frame(x = xout)) - data_out <- data.frame(pred) - colnames(data_out) <- c(xname, yname) - } + # here transformation occurs + # we give warning if it is strange - # covers gam-tp and gam-ps - if (grepl(method, pattern = "gam")) { - smoothing_par <- ifelse(is.numeric(smoothing_par), smoothing_par, 1) - lil_data <- tibble(x = x, y = y, w = w) - bs <- gsub(method, pattern = "gam-", replacement = "") - fit <- mgcv::gam(y ~ + s(x, m = smoothing_par), - bs = bs, - data = lil_data, - weights = w, - method = "REML", - select = TRUE, - family = "gaussian") - pred <- predict(fit, newdata = data.frame(x = 
xout)) - data_out <- tibble(!!xname := xout, !!yname := pred) + if(units == "counts" & scale != "none") { + + warning("You are applying transformation to counts which is strange") + } - return(data_out) + split_data <- data_in %>% + mutate( + {{ Y }} := { + + y <- !!sym(Y) + + switch(scale, + log = log(y), + sqrt = sqrt(y), + logit = qlogis(y), + none = y + ) + }, .by = ".id" + ) %>% + group_split(.data$`.id`) -} - -#' @title `smooth1d` -#' @description Smooth a univariate time series, optionally using `weights.` Choose between the super-smoother (`"supsmu"`) method, loess (`"lowess"` or `"loess"`) , smoothing splines (`"cubicsplines"`), thin-plate splines (`"gam-tp"`), or p-splines (`"gam-ps"`). Input data may have multiple observations per x coordinate. Output can be for arbitrary x-coordinates (`xout`). If grouping variables have been declared, then we use the same parameters for each subset. -#' @details `"supsmu"` method takes a `smoothing_par` between 0 and 10. `"lowess"` and -#' @param data_in `data.frame` with x and y coordinates to fit to. Optionally with `weights` -#' @param method character. Smoothing method desired. options `"supsmu"` (default),`"lowess"`,`"loess"`,`"cubicspline"`,`"gam-tp"`,`"gam-ps"` -#' @param smoothing_par smoothing parameter, interpretation varies by method, but higher always means smoother.Default 1 -#' @param xout numeric vector of coordinates to predict for. Defaults to original unique coordinates. 
-#' @param xname name of variable containing x coordinate, default `x` -#' @param yname name of variable containing y coordinate, default `y` -#' @importFrom dplyr mutate filter reframe bind_rows ungroup .data -#' @importFrom purrr map_lgl -#' @importFrom tidyselect all_of -#' @importFrom tidyr unnest -#' @importFrom rlang set_names .data -#' @export -#' @examples -#' library(tibble) -#' x = c(1950.77104722793, 1951.77104722793, 1952.77104722793, 1952.77104722793, -#' 1953.77104722793, 1954.77104722793, 1955.77104722793, 1956.77104722793, -#' 1957.77104722793, 1957.77104722793, 1958.77104722793, 1959.77104722793, -#' 1963.5, 1964.5, 1965.5, 1966.5, 1967.5, 1968.5, 1969.5, 1970.5, -#' 1970.5, 1971.5, 1971.5, 1972.5, 1972.5, 1973.5, 1973.5, 1974.5, -#' 1974.5, 1975.5, 1975.5, 1976.5, 1976.5, 1977.5, 1977.5, 1978.5, -#' 1978.5, 1979.5, 1979.5, 1980.5, 1980.5, 1981.5, 1981.5, 1982.5, -#' 1982.5, 1983.5, 1983.5, 1984.5, 1984.5, 1985.5, 1985.5, 1986.5, -#' 1986.5, 1987.5, 1987.5, 1988.5, 1988.5, 1989.5, 1989.5, 1990.5, -#' 1990.5, 1991.5, 1991.5, 1992.5, 1992.5, 1993.5, 1993.5, 1994.5, -#' 1994.5, 1995.5, 1995.5, 1996.5, 1996.5, 1997.5, 1997.5, 1998.5, -#' 1998.5, 1998.5, 1999.5, 1999.5, 1999.5, 2000.5, 2000.5, 2000.5, -#' 2001.5, 2001.5, 2001.5, 2001.5, 2002.5, 2002.5, 2002.5, 2003.5, -#' 2003.5, 2003.5, 2004.5, 2004.5, 2005.5, 2005.5, 2006.5, 2006.5, -#' 2007.5, 2007.5, 2008.5, 2008.5, 2009.5, 2009.5, 2010.5, 2010.5, -#' 2011.5, 2012.5, 2013.5, 2014.5, 2015.5, 2016.5, 2017.5) -#' -#' y = c(5.28312492370605, 5.4010272026062, 5.55507040023804, 5.52694797515869, -#' 5.65941572189331, 5.81246614456177, 5.95277643203735, 6.07998991012573, -#' 6.20043277740479, 6.23209381103516, 6.4145884513855, 6.44994592666626, -#' 5.77428722381592, 6.09462738037109, 6.31580305099487, 5.68929624557495, -#' 6.34508848190308, 5.67744398117065, 5.6477165222168, 5.12978315353394, -#' 4.83979654312134, 5.40941429138184, 4.93997049331665, 5.06586456298828, -#' 4.68799591064453, 4.31546640396118, 
4.18193292617798, 3.75800633430481, -#' 3.82137632369995, 3.17197370529175, 3.41054058074951, 3.08278703689575, -#' 3.04342699050903, 3.01940298080444, 2.9705445766449, 2.96306347846985, -#' 2.88018417358398, 2.72204685211182, 2.72689270973206, 3.43744516372681, -#' 2.88990616798401, 3.12944483757019, 3.18246674537659, 3.20358324050903, -#' 3.0853967666626, 3.38533687591553, 3.18455958366394, 3.14047956466675, -#' 3.08752226829529, 3.15941309928894, 3.09168982505798, 3.22912931442261, -#' 2.95333743095398, 3.05898070335388, 2.91993451118469, 3.29801154136658, -#' 2.93581032752991, 3.18667984008789, 2.92741537094116, 3.10476756095886, -#' 2.86983323097229, 3.31816387176514, 2.87090802192688, 3.57006907463074, -#' 3.20188307762146, 3.38520860671997, 3.1142041683197, 3.05472731590271, -#' 2.88588929176331, 2.85394668579102, 2.69696426391602, 2.71265292167664, -#' 2.49935364723206, 2.55814361572266, 2.37988924980164, 2.35105299949646, -#' 2.32249999046326, 2.21962690353394, 2.46929597854614, 2.43600010871887, -#' 2.6223669052124, 2.31005716323853, 2.33249998092651, 2.408358335495, -#' 2.39718627929688, 2.38599991798401, 3, 2.51987075805664, 2.1131649017334, -#' 2.13849997520447, 2.04657125473022, 2.06265759468079, 2.10050010681152, -#' 1.98469293117523, 2.09120869636536, 2.2427921295166, 1.97930490970612, -#' 2.15000796318054, 2.06816005706787, 2.18898129463196, 1.77942955493927, -#' 1.99617087841034, 1.89081299304962, 2.0644690990448, 1.85119438171387, -#' 2.03594541549683, 1.83351850509644, 2.02167296409607, 1.89562964439392, -#' 1.89205574989319, 1.85763072967529, 1.68259692192078, 1.69228148460388, -#' 1.56845271587372, 1.37076985836029) -#' data_in <- tibble(x=x,y=y,w=1) -#' xout <- 1950:2018 + .5 -#' \dontrun{ -#' plot(data_in$x, data_in$y) -#' lines(xout, smooth1d(data_in, method = "supsmu", -#' xout = xout, smoothing_par = .2)$y, -#' col = "red") -#' lines(xout, smooth1d(data_in, method = "loess", -#' xout = xout, smoothing_par = .2)$y, -#' col = "blue", lty = 
"82") -#' lines(xout, smooth1d(data_in, method = "gam-ps", xout = xout)$y, -#' col = "forestgreen" ,lty="82") -#' lines(xout, smooth1d(data_in, method = "cubicspline", -#' xout = xout, smoothing_par =1)$y, -#' col = "purple", lty="22") -#' } -#' - -smooth1d <- function(data_in, - method = c("supsmu", "lowess", "loess", - "cubicspline", "gam-tp", "gam-ps")[1], - smoothing_par = 1, - xout = data_in[["x"]] |> unique(), - xname = "x", - yname = "y"){ - if (!(".id" %in% colnames(data_in))) { - data_in <- data_in |> - mutate(.id = "all") - } - if (!"weights" %in% colnames(data_in)){ - data_in <- data_in |> - mutate(weights = 1) + for_one <- function(data) { + + x <- data[[X]] + y <- data[[Y]] + w <- data[["weights"]] + + if (method == "supsmu") { + + smoothing_par <- case_when(smoothing_par < 0 ~ 0, + smoothing_par > 10 ~ 10, + TRUE ~ smoothing_par) + + fit <- supsmu(x = x, + y = y, + wt = w, + span = "cv", + bass = smoothing_par) + # ---------------------------------------------------------------# + # TR: would be better to follow this function with interpolate() # + # instead of fixing this option? # + # Then again maybe keep it cuz most methods have predict()? 
# + # ---------------------------------------------------------------# + pred <- interp1(fit$x, + fit$y, + xi = xout, + method = 'pchip', + extrap = TRUE) + data_out <- tibble(!!X := xout, !!Y := pred) + } + + if (method %in% c("lowess", "loess")) { + + degree_val <- ifelse(method == "lowess", 1, 2) + + fit <- loess(y ~ x, + weights = w, + span = smoothing_par, + degree = degree_val, + control = loess.control(surface = "direct")) + pred <- predict(fit, newdata = data.frame(x = xout)) + data_out <- tibble(!!X := xout, !!Y := pred) + } + + if (method == "cubicspline") { + if (is.numeric(smoothing_par)){ + fit <- smooth.spline(x = x, + y = y, + w = w, + spar = smoothing_par) + } else { + fit <- smooth.spline(x = x, + y = y, + w = w, + cv = FALSE) + } + + pred <- predict(fit, x = data.frame(x = xout)) + data_out <- data.frame(pred) + names(data_out) <- c(X, Y) + data_out <- as_tibble(data_out) + } + + # covers gam-tp and gam-ps + if (grepl(method, pattern = "gam")) { + + smoothing_par <- ifelse(is.numeric(smoothing_par), smoothing_par, 1) + lil_data <- tibble(x = x, y = y, w = w) + bs <- gsub(method, pattern = "gam-", replacement = "") + fit <- gam(y ~ + s(x, m = smoothing_par), + bs = bs, + data = lil_data, + weights = w, + method = "REML", + select = TRUE, + family = "gaussian") + pred <- predict(fit, newdata = data.frame(x = xout)) + data_out <- tibble(!!X := xout, !!Y := pred) + } + + return(data_out) } - data_out <- data_in |> - ungroup() |> - reframe( - smooth1d_chunk(data_in. 
= .data, - method = method, - smoothing_par = smoothing_par, - xout = xout, - xname = xname, - yname = yname), - .by = all_of(c(".id")) - ) #|> - #set_names(c(".id", by_args, "data")) + res <- split_data %>% + map(~ for_one(.x)) %>% + bind_rows(.id = ".id") - # data_out <- data_out |> - # filter(map_lgl(data, is.data.frame))|> - # unnest(.data$data) - return(data_out) + compare <- split_data %>% + bind_rows(.id = ".id") -} - -#' @title `check_smooth` -#' @description Creates a plot of original data as points and smooth result as line for each `.id`. -#' @param data_out a data.frame or tibble. The data.frame output of the `interpolate` function. -#' @param data_in a data.frame or tibble. The original data provided by user -#' @return A figure of original data as points and smooth result as line for each `.id` -#' @importFrom dplyr mutate select summarise group_by -#' @importFrom scales pretty_breaks -#' @importFrom ggplot2 ggplot geom_point geom_line theme_minimal aes theme element_text scale_x_continuous scale_y_continuous -#' @export -#' @examples -#' library(tibble) -#' x = c(1950.77104722793, 1951.77104722793, 1952.77104722793, 1952.77104722793, -#' 1953.77104722793, 1954.77104722793, 1955.77104722793, 1956.77104722793, -#' 1957.77104722793, 1957.77104722793, 1958.77104722793, 1959.77104722793, -#' 1963.5, 1964.5, 1965.5, 1966.5, 1967.5, 1968.5, 1969.5, 1970.5, -#' 1970.5, 1971.5, 1971.5, 1972.5, 1972.5, 1973.5, 1973.5, 1974.5, -#' 1974.5, 1975.5, 1975.5, 1976.5, 1976.5, 1977.5, 1977.5, 1978.5, -#' 1978.5, 1979.5, 1979.5, 1980.5, 1980.5, 1981.5, 1981.5, 1982.5, -#' 1982.5, 1983.5, 1983.5, 1984.5, 1984.5, 1985.5, 1985.5, 1986.5, -#' 1986.5, 1987.5, 1987.5, 1988.5, 1988.5, 1989.5, 1989.5, 1990.5, -#' 1990.5, 1991.5, 1991.5, 1992.5, 1992.5, 1993.5, 1993.5, 1994.5, -#' 1994.5, 1995.5, 1995.5, 1996.5, 1996.5, 1997.5, 1997.5, 1998.5, -#' 1998.5, 1998.5, 1999.5, 1999.5, 1999.5, 2000.5, 2000.5, 2000.5, -#' 2001.5, 2001.5, 2001.5, 2001.5, 2002.5, 2002.5, 2002.5, 2003.5, 
-#' 2003.5, 2003.5, 2004.5, 2004.5, 2005.5, 2005.5, 2006.5, 2006.5, -#' 2007.5, 2007.5, 2008.5, 2008.5, 2009.5, 2009.5, 2010.5, 2010.5, -#' 2011.5, 2012.5, 2013.5, 2014.5, 2015.5, 2016.5, 2017.5) -#' -#' y = c(5.28312492370605, 5.4010272026062, 5.55507040023804, 5.52694797515869, -#' 5.65941572189331, 5.81246614456177, 5.95277643203735, 6.07998991012573, -#' 6.20043277740479, 6.23209381103516, 6.4145884513855, 6.44994592666626, -#' 5.77428722381592, 6.09462738037109, 6.31580305099487, 5.68929624557495, -#' 6.34508848190308, 5.67744398117065, 5.6477165222168, 5.12978315353394, -#' 4.83979654312134, 5.40941429138184, 4.93997049331665, 5.06586456298828, -#' 4.68799591064453, 4.31546640396118, 4.18193292617798, 3.75800633430481, -#' 3.82137632369995, 3.17197370529175, 3.41054058074951, 3.08278703689575, -#' 3.04342699050903, 3.01940298080444, 2.9705445766449, 2.96306347846985, -#' 2.88018417358398, 2.72204685211182, 2.72689270973206, 3.43744516372681, -#' 2.88990616798401, 3.12944483757019, 3.18246674537659, 3.20358324050903, -#' 3.0853967666626, 3.38533687591553, 3.18455958366394, 3.14047956466675, -#' 3.08752226829529, 3.15941309928894, 3.09168982505798, 3.22912931442261, -#' 2.95333743095398, 3.05898070335388, 2.91993451118469, 3.29801154136658, -#' 2.93581032752991, 3.18667984008789, 2.92741537094116, 3.10476756095886, -#' 2.86983323097229, 3.31816387176514, 2.87090802192688, 3.57006907463074, -#' 3.20188307762146, 3.38520860671997, 3.1142041683197, 3.05472731590271, -#' 2.88588929176331, 2.85394668579102, 2.69696426391602, 2.71265292167664, -#' 2.49935364723206, 2.55814361572266, 2.37988924980164, 2.35105299949646, -#' 2.32249999046326, 2.21962690353394, 2.46929597854614, 2.43600010871887, -#' 2.6223669052124, 2.31005716323853, 2.33249998092651, 2.408358335495, -#' 2.39718627929688, 2.38599991798401, 3, 2.51987075805664, 2.1131649017334, -#' 2.13849997520447, 2.04657125473022, 2.06265759468079, 2.10050010681152, -#' 1.98469293117523, 2.09120869636536, 
2.2427921295166, 1.97930490970612, -#' 2.15000796318054, 2.06816005706787, 2.18898129463196, 1.77942955493927, -#' 1.99617087841034, 1.89081299304962, 2.0644690990448, 1.85119438171387, -#' 2.03594541549683, 1.83351850509644, 2.02167296409607, 1.89562964439392, -#' 1.89205574989319, 1.85763072967529, 1.68259692192078, 1.69228148460388, -#' 1.56845271587372, 1.37076985836029) -#' data_in <- tibble(x = x, -#' y = y, -#' w = 1) -#' xout <- 1950:2018 + .5 -#' -#' data_out <- smooth1d(data_in, method = "gam-ps", xout = xout) -#' -#' check_smooth(data_in, data_out) -#' - -check_smooth <- function(data_in, - data_out) { + compare$.id <- as_factor(compare$.id) + res$.id <- as_factor(res$.id) - if (!(".id" %in% colnames(data_in))) { - data_in <- data_in |> - mutate(.id = "all") - } + fig <- ggplot() + + geom_point( + aes(x = pull(compare, !!sym(X)), + y = pull(compare, !!sym(Y)), + color = pull(compare, ".id") + )) + + geom_line( + aes(x = pull(res, !!sym(X)), + y = pull(res, !!sym(Y)), + color = pull(res, ".id") + )) + + theme( + legend.position = "bottom", + legend.title = element_text(face = "bold"), + axis.text = element_text(color = "black") + ) + + scale_x_continuous(breaks = pretty_breaks()) + + scale_y_continuous(breaks = pretty_breaks()) + + labs( + x = NULL, + y = NULL + ) + + scale_color_brewer( + palette = "Set1", + name = "Group" + ) + + theme_bw() + + return(list(result = res, + plot = fig)) - ggplot() + - geom_point(data = data_in, aes(x = .data$x, - y = .data$y, - color = .data$.id)) + - geom_line(data = data_out, aes(x = .data$x, - y = .data$y, - color = .data$.id)) + - theme_minimal() + - theme(legend.position = "bottom", - axis.text = element_text(color = "black")) + - scale_x_continuous(breaks = pretty_breaks()) + - scale_y_continuous(breaks = pretty_breaks()) -} +} \ No newline at end of file diff --git a/man/check_data_graduate_time.Rd b/man/check_data_graduate_time.Rd new file mode 100644 index 0000000..9fedfe5 --- /dev/null +++ 
b/man/check_data_graduate_time.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_data_graduate_time} +\alias{check_data_graduate_time} +\title{Validate input data for time graduation} +\usage{ +check_data_graduate_time(value, time, .id = NULL) +} +\arguments{ +\item{value}{Numeric vector of values to be graduated (e.g., life expectancy).} + +\item{time}{Numeric vector of time points corresponding to \code{value} (e.g., calendar years).} + +\item{.id}{Optional grouping variable. If provided, checks are performed separately by group.} +} +\value{ +A data frame summarizing validation results. Contains: +\itemize{ +\item \code{check} – name of the validation performed +\item \code{message} – error message if the check fails, otherwise \code{NA} +\item \code{pass} – \code{"Pass"} or \code{"Fail"} +\item \code{.id} – group identifier +} +} +\description{ +Performs structural and numerical validation checks for time-series data used in graduation or smoothing procedures. The function verifies that time points are numeric, unique, sequential, and evenly spaced, and that the associated values are numeric, finite, and strictly positive. +Validation is performed separately for each \code{.id} group. If \code{.id} is \code{NULL}, all observations are treated as belonging to a single group. +} +\examples{ +check_data_graduate_time( + value = c(70.1, 70.4, 70.8), + time = c(2000, 2005, 2010) +) + +} diff --git a/man/check_data_heaping.Rd b/man/check_data_heaping.Rd new file mode 100644 index 0000000..58b85f9 --- /dev/null +++ b/man/check_data_heaping.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_data_heaping} +\alias{check_data_heaping} +\title{Check consistency of data for age heaping diagnostics} +\usage{ +check_data_heaping(data, X) +} +\arguments{ +\item{data}{A data frame containing the variable to be evaluated. 
+Optionally may contain a grouping column \code{.id}.} + +\item{X}{A character string specifying the name of the variable to be checked (e.g., \code{"Deaths"}).} +} +\value{ +A data frame summarizing validation results for each group. +The returned object contains: +\describe{ +\item{check}{Name of the validation performed} +\item{message}{Error message if the check fails, otherwise \code{NA}} +\item{pass}{Either \code{"Pass"} or \code{"Fail"}} +\item{.id}{Group identifier} +} +} +\description{ +Validates a data frame prior to performing age heaping analysis. +The function checks whether a specified variable contains valid numeric values and ensures that required structural conditions are satisfied. +If a grouping variable \code{.id} is not present, one is automatically created with a single level \code{"all"}. +Validation is performed separately for each \code{.id} group. +} +\details{ +Validate input data for age heaping analysis + +For the selected variable \code{X}, the following checks are performed: +\itemize{ +\item Numeric type validation +\item Missing value detection +\item Infinite value detection +\item Strict positivity constraint (all values must be > 0) +\item Presence of an \code{Age} column (required for age heaping analysis) +} +Each validation is conducted within levels of \code{.id}. 
+} +\examples{ +library(readr) +fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +data_in <- data_in[data_in$`.id` == 1, ] +check_data_heaping(data = data_in, X = "Deaths") + +} diff --git a/man/check_data_lifetable.Rd b/man/check_data_lifetable.Rd new file mode 100644 index 0000000..277beb4 --- /dev/null +++ b/man/check_data_lifetable.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/checkers.R +\name{check_data_lifetable} +\alias{check_data_lifetable} +\title{Check consistency of lifetable input data} +\usage{ +check_data_lifetable(data) +} +\arguments{ +\item{data}{A data frame containing lifetable-related columns. Must include at least one of: \code{nMx}, \code{nlx}, \code{nqx}, \code{npx}, \code{ndx}, or \code{nLx}. +Optionally may contain an \code{Age} column and a grouping column \code{.id}.} +} +\value{ +A data frame summarizing validation results for each group. The returned object contains: +\describe{ +\item{check}{Name of the validation performed} +\item{message}{Error message if the check fails, otherwise \code{NA}} +\item{pass}{Either \code{"Pass"} or \code{"Fail"}} +\item{.id}{Group identifier} +} +} +\description{ +Validates a data frame intended for lifetable reconstruction. +The function checks whether the input contains at least one column that can be used to reconstruct a full lifetable and performs a set of structural and value-based validations. +If no grouping variable \code{.id} is present, one is automatically created with a single level \code{"all"}. +} +\details{ +Checks input data for lifetable reconstruction + +The function searches for at least one of the following columns (in order of preference): +\itemize{ +\item \code{nMx} +\item \code{nlx} +\item \code{nqx} +\item \code{npx} +\item \code{ndx} +\item \code{nLx} +} +The first available column is used for validation checks. 
The following checks are performed: +\itemize{ +\item Numeric type validation +\item Missing value detection +\item Non-negativity constraint +} +If an \code{Age} column is present, additional checks are performed: +\itemize{ +\item Age coherence (numeric validity) +\item Redundancy detection +\item Sequential ordering +} +Validation is performed separately for each \code{.id} group. +} +\examples{ +library(readr) +fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +data_in$nMx <- data_in$Deaths / data_in$Exposures +data_in <- data_in[data_in$`.id` == 1, ] +check_data_lifetable(data_in) + +} +\seealso{ +\code{\link{age2int}}, +\code{\link{is_age_coherent}}, +\code{\link{is_age_redundant}}, +\code{\link{is_age_sequential}} +} diff --git a/man/check_heaping_general.Rd b/man/check_heaping_general.Rd index adaaa2a..7f36c73 100644 --- a/man/check_heaping_general.Rd +++ b/man/check_heaping_general.Rd @@ -2,26 +2,55 @@ % Please edit documentation in R/input_diagnostics.R \name{check_heaping_general} \alias{check_heaping_general} -\title{\code{check_heaping_general}} +\title{General age-heaping diagnostics} \usage{ -check_heaping_general(data, y) +check_heaping_general(data, y, units = c("count", "rate", "proportion")) } \arguments{ -\item{data}{data.frame. User file from the read_data command with the minimum data on \code{Exposures}, \code{Death} and \code{Age}. Data can be both in 5 and 1 year age intervals} +\item{data}{A data frame containing at least \code{Age} and the variable \code{y}. If \code{.id} is present, diagnostics are computed by group; otherwise all observations are treated as one group.} -\item{y}{character.Variable name for which the heaping should be checked \code{Deaths} or \code{Exposures}.} +\item{y}{Character string giving the name of the age-specific variable.} + +\item{units}{Type of input data. 
One of \code{"count"}, \code{"rate"}, or \code{"proportion"}.} } \value{ -A data.frame with 3 columns \code{method} - the method used for age heaping evaluation, \code{result} - the resulting heaping measure, and \code{.id} - user specified groups identifier (if missing is set to "all") +A data frame with one row per index and group, containing: +\itemize{ +\item \code{method} – diagnostic index name +\item \code{result} – numeric value +\item \code{level} – qualitative data-quality category +\item \code{color} – associated color code +\item \verb{age scale} – \code{"single"} or \code{"5-year"} +\item \code{.id} – group identifier +} } \description{ -Check the age heaping for 5 or 1 year data. +Computes age-heaping and age-irregularity indices from age-specific counts, rates, or proportions. The set of diagnostics depends on the age scale: digit-preference indices are used for single-year ages, while irregularity indices are used for grouped ages. +} +\details{ +\itemize{ +\item Single-year ages: Bachi and Myers digit-preference indices. +\item Grouped ages: roughness and sawtooth irregularity indices. +For \code{"count"} data, values are internally converted to proportions. +For \code{"rate"} and \code{"proportion"} data, values are normalized to sum to one +to ensure comparability across ages. +} } \examples{ -\dontrun{ -check_heaping_general( - data = data, - y = "Exposures") +library(readr) +fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") +data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +data_in$nMx <- data_in$Deaths / data_in$Exposures +data_in <- data_in[data_in$.id == 1, ] +check_heaping_general(data = data_in, y = "Deaths", units = "count") +check_heaping_general(data = data_in, y = "nMx", units = "rate") } - +\references{ +Bachi (1951); Myers (1940); United Nations (1955). 
+} +\seealso{ +\code{\link{check_heaping_bachi}}, +\code{\link{check_heaping_myers}}, +\code{\link{check_heaping_roughness}}, +\code{\link{check_heaping_sawtooth}} } diff --git a/man/check_smooth.Rd b/man/check_smooth.Rd deleted file mode 100644 index 0405bb3..0000000 --- a/man/check_smooth.Rd +++ /dev/null @@ -1,77 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/smoothing1d.R -\name{check_smooth} -\alias{check_smooth} -\title{\code{check_smooth}} -\usage{ -check_smooth(data_in, data_out) -} -\arguments{ -\item{data_in}{a data.frame or tibble. The original data provided by user} - -\item{data_out}{a data.frame or tibble. The data.frame output of the \code{interpolate} function.} -} -\value{ -A figure of original data as points and smooth result as line for each \code{.id} -} -\description{ -Creates a plot of original data as points and smooth result as line for each \code{.id}. -} -\examples{ -library(tibble) -x = c(1950.77104722793, 1951.77104722793, 1952.77104722793, 1952.77104722793, - 1953.77104722793, 1954.77104722793, 1955.77104722793, 1956.77104722793, - 1957.77104722793, 1957.77104722793, 1958.77104722793, 1959.77104722793, - 1963.5, 1964.5, 1965.5, 1966.5, 1967.5, 1968.5, 1969.5, 1970.5, - 1970.5, 1971.5, 1971.5, 1972.5, 1972.5, 1973.5, 1973.5, 1974.5, - 1974.5, 1975.5, 1975.5, 1976.5, 1976.5, 1977.5, 1977.5, 1978.5, - 1978.5, 1979.5, 1979.5, 1980.5, 1980.5, 1981.5, 1981.5, 1982.5, - 1982.5, 1983.5, 1983.5, 1984.5, 1984.5, 1985.5, 1985.5, 1986.5, - 1986.5, 1987.5, 1987.5, 1988.5, 1988.5, 1989.5, 1989.5, 1990.5, - 1990.5, 1991.5, 1991.5, 1992.5, 1992.5, 1993.5, 1993.5, 1994.5, - 1994.5, 1995.5, 1995.5, 1996.5, 1996.5, 1997.5, 1997.5, 1998.5, - 1998.5, 1998.5, 1999.5, 1999.5, 1999.5, 2000.5, 2000.5, 2000.5, - 2001.5, 2001.5, 2001.5, 2001.5, 2002.5, 2002.5, 2002.5, 2003.5, - 2003.5, 2003.5, 2004.5, 2004.5, 2005.5, 2005.5, 2006.5, 2006.5, - 2007.5, 2007.5, 2008.5, 2008.5, 2009.5, 2009.5, 2010.5, 2010.5, - 2011.5, 2012.5, 
2013.5, 2014.5, 2015.5, 2016.5, 2017.5) - -y = c(5.28312492370605, 5.4010272026062, 5.55507040023804, 5.52694797515869, - 5.65941572189331, 5.81246614456177, 5.95277643203735, 6.07998991012573, - 6.20043277740479, 6.23209381103516, 6.4145884513855, 6.44994592666626, - 5.77428722381592, 6.09462738037109, 6.31580305099487, 5.68929624557495, - 6.34508848190308, 5.67744398117065, 5.6477165222168, 5.12978315353394, - 4.83979654312134, 5.40941429138184, 4.93997049331665, 5.06586456298828, - 4.68799591064453, 4.31546640396118, 4.18193292617798, 3.75800633430481, - 3.82137632369995, 3.17197370529175, 3.41054058074951, 3.08278703689575, - 3.04342699050903, 3.01940298080444, 2.9705445766449, 2.96306347846985, - 2.88018417358398, 2.72204685211182, 2.72689270973206, 3.43744516372681, - 2.88990616798401, 3.12944483757019, 3.18246674537659, 3.20358324050903, - 3.0853967666626, 3.38533687591553, 3.18455958366394, 3.14047956466675, - 3.08752226829529, 3.15941309928894, 3.09168982505798, 3.22912931442261, - 2.95333743095398, 3.05898070335388, 2.91993451118469, 3.29801154136658, - 2.93581032752991, 3.18667984008789, 2.92741537094116, 3.10476756095886, - 2.86983323097229, 3.31816387176514, 2.87090802192688, 3.57006907463074, - 3.20188307762146, 3.38520860671997, 3.1142041683197, 3.05472731590271, - 2.88588929176331, 2.85394668579102, 2.69696426391602, 2.71265292167664, - 2.49935364723206, 2.55814361572266, 2.37988924980164, 2.35105299949646, - 2.32249999046326, 2.21962690353394, 2.46929597854614, 2.43600010871887, - 2.6223669052124, 2.31005716323853, 2.33249998092651, 2.408358335495, - 2.39718627929688, 2.38599991798401, 3, 2.51987075805664, 2.1131649017334, - 2.13849997520447, 2.04657125473022, 2.06265759468079, 2.10050010681152, - 1.98469293117523, 2.09120869636536, 2.2427921295166, 1.97930490970612, - 2.15000796318054, 2.06816005706787, 2.18898129463196, 1.77942955493927, - 1.99617087841034, 1.89081299304962, 2.0644690990448, 1.85119438171387, - 2.03594541549683, 1.83351850509644, 
2.02167296409607, 1.89562964439392, - 1.89205574989319, 1.85763072967529, 1.68259692192078, 1.69228148460388, - 1.56845271587372, 1.37076985836029) -data_in <- tibble(x = x, - y = y, - w = 1) -xout <- 1950:2018 + .5 - -data_out <- smooth1d(data_in, method = "gam-ps", xout = xout) - -check_smooth(data_in, data_out) - -} diff --git a/man/graduate_auto.Rd b/man/graduate_auto.Rd index 6a3413e..979e55f 100644 --- a/man/graduate_auto.Rd +++ b/man/graduate_auto.Rd @@ -43,7 +43,7 @@ fpath <- system.file("extdata", "five_hmd_spain.csv.gz", package = "ODAPbackend") data_in <- read_csv(fpath) |> - dplyr::select(-1) |> + select(-1) |> filter(.id == 1) ex1 <- graduate_auto(data_in, diff --git a/man/graduate_auto_5.Rd b/man/graduate_auto_5.Rd index 81fcb1f..ab6ec32 100644 --- a/man/graduate_auto_5.Rd +++ b/man/graduate_auto_5.Rd @@ -26,7 +26,7 @@ fpath <- system.file("extdata", "five_hmd_spain.csv.gz", package = "ODAPbackend") data_in <- read_csv(fpath) |> - dplyr::select(-1) |> + select(-1) |> filter(.id == 1) graduate_auto_5(data_in, "Exposures")$Exposures diff --git a/man/graduate_time.Rd b/man/graduate_time.Rd new file mode 100644 index 0000000..7d43660 --- /dev/null +++ b/man/graduate_time.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/graduation.R +\name{graduate_time} +\alias{graduate_time} +\title{Graduate grouped time data to single-year estimates +Expands grouped time data (e.g., 5-year intervals) into single-year values using various graduation methods. The function supports smoothing, monotone interpolation, uniform splitting, cubic regression splines, penalized composite link models (PCLM), and LOESS smoothing.} +\usage{ +graduate_time( + data_in, + method = c("loess", "native", "cubic", "mono", "uniform", "pclm"), + X, + Y, + timeInt = 5 +) +} +\arguments{ +\item{data_in}{Data frame containing grouped time data.} + +\item{method}{Graduation method. 
One of \code{"loess"}, \code{"native"}, \code{"cubic"}, \code{"mono"}, \code{"uniform"}, \code{"pclm"}.} + +\item{X}{Name of the time variable (character string).} + +\item{Y}{Name of the grouped value variable (character string).} + +\item{timeInt}{Width of grouping interval (default = 5).} +} +\value{ +A data frame with columns: +\itemize{ +\item \code{time} — single-year time values +\item \code{value} — graduated single-year estimates +\item \code{.id} — subgroup identifier +} +} +\description{ +Available methods: +\itemize{ +\item \code{"uniform"} — distributes totals evenly within each interval +\item \code{"mono"} — monotone cubic interpolation (Hyman filtered spline) +\item \code{"native"} — smoothing spline via \code{stats::smooth.spline} +\item \code{"cubic"} — cubic regression spline via \code{mgcv::gam} +\item \code{"loess"} — local polynomial smoothing via \code{stats::loess} +\item \code{"pclm"} — penalized composite link model via \code{ungroup::pclm} +} +} +\details{ +Block totals are preserved for all smoothing-based methods by rescaling +predictions within each interval to match original grouped totals. + +If column \code{.id} exists in \code{data_in}, graduation is performed separately within each subgroup. Otherwise all observations are treated as a single group. +For spline-based methods, predicted single-year values are rescaled +within each interval so that grouped totals are exactly preserved. +The PCLM method relies on the \pkg{ungroup} package and may internally rescale small counts to avoid negative fitted values. 
+} +\examples{ +df <- data.frame( + Year = seq(1950, 2015, by = 5), + Deaths = c(403564.9, 426012.0, 466215.9, 523753.8, 560874.1, + 545608.3, 555335.9, 594584.6, 608425.3, 638167.3, + 655438.6, 689386.4, 740519.0, 804439.3) +) + +graduate_time( + df, + method = "pclm", + X = "Year", + Y = "Deaths", + timeInt = 5 +) + +} diff --git a/man/odap_opag.Rd b/man/odap_opag.Rd index f6a323e..bca9cf4 100644 --- a/man/odap_opag.Rd +++ b/man/odap_opag.Rd @@ -70,7 +70,6 @@ The function uses internal helpers such as \code{conditional_filter()}, \code{conditional_arrange()}, and \code{create_groupid()} to standardize and align data. } \examples{ -\dontrun{ library(dplyr) data_in <- tibble( @@ -90,7 +89,3 @@ res$data_out$India # Plot original vs redistributed population print(res$figures$India) } -} -\seealso{ -\code{\link[DemoTools]{OPAG}}, \code{\link{link[DemoTools]{lt_single_mx}}, \code{\link{\link[DemoTools]{groupAges}} -} diff --git a/man/plot_smooth_compare.Rd b/man/plot_smooth_compare.Rd index 11db1a8..2e72669 100644 --- a/man/plot_smooth_compare.Rd +++ b/man/plot_smooth_compare.Rd @@ -29,8 +29,8 @@ fpath <- system.file("extdata", "abridged_hmd_spain.csv.gz", package = "ODAPbackend") data_in <- read_csv(fpath, show_col_types = FALSE) |> - dplyr::select(-1) |> - dplyr::filter(.id == 1) + select(-1) |> + filter(.id == 1) data_out <- smooth_flexible_chunk( data_in, diff --git a/man/smooth1d.Rd b/man/smooth1d.Rd index ba2f3b0..fc41572 100644 --- a/man/smooth1d.Rd +++ b/man/smooth1d.Rd @@ -2,99 +2,125 @@ % Please edit documentation in R/smoothing1d.R \name{smooth1d} \alias{smooth1d} -\title{\code{smooth1d}} +\title{One–Dimensional Smoothing Wrapper for Age or Time Series +\code{smooth1d()} provides a unified interface for smoothing one–dimensional demographic or time–series data (e.g., age schedules or time trends). +The function supports multiple smoothing engines including \code{supsmu}, \code{lowess}, \code{loess}, cubic splines, and GAM-based methods. 
+Optional transformations (log, sqrt, logit) can be applied prior to smoothing. +The function automatically handles grouped data via a \code{.id} column.} \usage{ smooth1d( data_in, - method = c("supsmu", "lowess", "loess", "cubicspline", "gam-tp", "gam-ps")[1], + units = c("count", "rate", "proportion"), + X = c("Age", "Time"), + Y = c("Exposures"), + scale = c("log", "sqrt", "logit", "none"), + method = c("supsmu", "lowess", "loess", "cubicspline", "gam-tp", "gam-ps"), smoothing_par = 1, - xout = unique(data_in[["x"]]), - xname = "x", - yname = "y" + xout = unique(data_in[[X]]) ) } \arguments{ -\item{data_in}{\code{data.frame} with x and y coordinates to fit to. Optionally with \code{weights}} +\item{data_in}{A data.frame or tibble containing the input data.} -\item{method}{character. Smoothing method desired. options \code{"supsmu"} (default),\code{"lowess"},\code{"loess"},\code{"cubicspline"},\code{"gam-tp"},\code{"gam-ps"}} +\item{units}{Character string. One of: +\itemize{ +\item \code{"count"} +\item \code{"rate"} +\item \code{"proportion"} +}} -\item{smoothing_par}{smoothing parameter, interpretation varies by method, but higher always means smoother.Default 1} +\item{X}{Character string. Name of the independent variable (typically \code{"Age"} or \code{"Time"}).} -\item{xout}{numeric vector of coordinates to predict for. Defaults to original unique coordinates.} +\item{Y}{Character string. 
Name of the dependent variable.}

-\item{xname}{name of variable containing x coordinate, default \code{x}}
+\item{scale}{Optional transformation applied before smoothing:
+\itemize{
+\item \code{"log"}
+\item \code{"sqrt"}
+\item \code{"logit"}
+\item \code{"none"}
+}}

-\item{yname}{name of variable containing y coordinate, default \code{y}}
+\item{method}{Smoothing engine:
+\itemize{
+\item \code{"supsmu"}
+\item \code{"lowess"}
+\item \code{"loess"}
+\item \code{"cubicspline"}
+\item \code{"gam-tp"} (thin plate regression spline)
+\item \code{"gam-ps"} (P-spline)
+}}
+
+\item{smoothing_par}{Numeric smoothing parameter. Interpretation depends on method.}
+
+\item{xout}{Optional vector of evaluation points. Defaults to unique values of \code{X}.}
+}
+\value{
+A named list with:
+\itemize{
+\item \code{result} - Tibble containing smoothed values
+\item \code{plot} - ggplot object comparing raw and smoothed values
+}
}
\description{
-Smooth a univariate time series, optionally using \code{weights.} Choose between the super-smoother (\code{"supsmu"}) method, loess (\code{"lowess"} or \code{"loess"}) , smoothing splines (\code{"cubicsplines"}), thin-plate splines (\code{"gam-tp"}), or p-splines (\code{"gam-ps"}). Input data may have multiple observations per x coordinate. Output can be for arbitrary x-coordinates (\code{xout}). If grouping variables have been declared, then we use the same parameters for each subset.
+One-Dimensional Smoothing Wrapper for Age or Time Series
+\code{smooth1d()} provides a unified interface for smoothing one-dimensional demographic or time-series data (e.g., age schedules or time trends).
+The function supports multiple smoothing engines including \code{supsmu}, \code{lowess}, \code{loess}, cubic splines, and GAM-based methods.
+Optional transformations (log, sqrt, logit) can be applied prior to smoothing.
+The function automatically handles grouped data via a \code{.id} column. 
} \details{ -\code{"supsmu"} method takes a \code{smoothing_par} between 0 and 10. \code{"lowess"} and +If the input data does not contain a \code{.id} column, a default group \code{"all"} is created. +Weights default to 1 if not supplied. +Transformations are applied before smoothing but not automatically back-transformed. } \examples{ -library(tibble) -x = c(1950.77104722793, 1951.77104722793, 1952.77104722793, 1952.77104722793, - 1953.77104722793, 1954.77104722793, 1955.77104722793, 1956.77104722793, - 1957.77104722793, 1957.77104722793, 1958.77104722793, 1959.77104722793, - 1963.5, 1964.5, 1965.5, 1966.5, 1967.5, 1968.5, 1969.5, 1970.5, - 1970.5, 1971.5, 1971.5, 1972.5, 1972.5, 1973.5, 1973.5, 1974.5, - 1974.5, 1975.5, 1975.5, 1976.5, 1976.5, 1977.5, 1977.5, 1978.5, - 1978.5, 1979.5, 1979.5, 1980.5, 1980.5, 1981.5, 1981.5, 1982.5, - 1982.5, 1983.5, 1983.5, 1984.5, 1984.5, 1985.5, 1985.5, 1986.5, - 1986.5, 1987.5, 1987.5, 1988.5, 1988.5, 1989.5, 1989.5, 1990.5, - 1990.5, 1991.5, 1991.5, 1992.5, 1992.5, 1993.5, 1993.5, 1994.5, - 1994.5, 1995.5, 1995.5, 1996.5, 1996.5, 1997.5, 1997.5, 1998.5, - 1998.5, 1998.5, 1999.5, 1999.5, 1999.5, 2000.5, 2000.5, 2000.5, - 2001.5, 2001.5, 2001.5, 2001.5, 2002.5, 2002.5, 2002.5, 2003.5, - 2003.5, 2003.5, 2004.5, 2004.5, 2005.5, 2005.5, 2006.5, 2006.5, - 2007.5, 2007.5, 2008.5, 2008.5, 2009.5, 2009.5, 2010.5, 2010.5, - 2011.5, 2012.5, 2013.5, 2014.5, 2015.5, 2016.5, 2017.5) +# Example 1: Age smoothing of mortality rates (log scale) +library(readr) -y = c(5.28312492370605, 5.4010272026062, 5.55507040023804, 5.52694797515869, - 5.65941572189331, 5.81246614456177, 5.95277643203735, 6.07998991012573, - 6.20043277740479, 6.23209381103516, 6.4145884513855, 6.44994592666626, - 5.77428722381592, 6.09462738037109, 6.31580305099487, 5.68929624557495, - 6.34508848190308, 5.67744398117065, 5.6477165222168, 5.12978315353394, - 4.83979654312134, 5.40941429138184, 4.93997049331665, 5.06586456298828, - 4.68799591064453, 4.31546640396118, 
4.18193292617798, 3.75800633430481, - 3.82137632369995, 3.17197370529175, 3.41054058074951, 3.08278703689575, - 3.04342699050903, 3.01940298080444, 2.9705445766449, 2.96306347846985, - 2.88018417358398, 2.72204685211182, 2.72689270973206, 3.43744516372681, - 2.88990616798401, 3.12944483757019, 3.18246674537659, 3.20358324050903, - 3.0853967666626, 3.38533687591553, 3.18455958366394, 3.14047956466675, - 3.08752226829529, 3.15941309928894, 3.09168982505798, 3.22912931442261, - 2.95333743095398, 3.05898070335388, 2.91993451118469, 3.29801154136658, - 2.93581032752991, 3.18667984008789, 2.92741537094116, 3.10476756095886, - 2.86983323097229, 3.31816387176514, 2.87090802192688, 3.57006907463074, - 3.20188307762146, 3.38520860671997, 3.1142041683197, 3.05472731590271, - 2.88588929176331, 2.85394668579102, 2.69696426391602, 2.71265292167664, - 2.49935364723206, 2.55814361572266, 2.37988924980164, 2.35105299949646, - 2.32249999046326, 2.21962690353394, 2.46929597854614, 2.43600010871887, - 2.6223669052124, 2.31005716323853, 2.33249998092651, 2.408358335495, - 2.39718627929688, 2.38599991798401, 3, 2.51987075805664, 2.1131649017334, - 2.13849997520447, 2.04657125473022, 2.06265759468079, 2.10050010681152, - 1.98469293117523, 2.09120869636536, 2.2427921295166, 1.97930490970612, - 2.15000796318054, 2.06816005706787, 2.18898129463196, 1.77942955493927, - 1.99617087841034, 1.89081299304962, 2.0644690990448, 1.85119438171387, - 2.03594541549683, 1.83351850509644, 2.02167296409607, 1.89562964439392, - 1.89205574989319, 1.85763072967529, 1.68259692192078, 1.69228148460388, - 1.56845271587372, 1.37076985836029) -data_in <- tibble(x=x,y=y,w=1) -xout <- 1950:2018 + .5 -\dontrun{ - plot(data_in$x, data_in$y) - lines(xout, smooth1d(data_in, method = "supsmu", - xout = xout, smoothing_par = .2)$y, - col = "red") - lines(xout, smooth1d(data_in, method = "loess", - xout = xout, smoothing_par = .2)$y, - col = "blue", lty = "82") - lines(xout, smooth1d(data_in, method = "gam-ps", xout = 
xout)$y, - col = "forestgreen" ,lty="82") - lines(xout, smooth1d(data_in, method = "cubicspline", - xout = xout, smoothing_par =1)$y, - col = "purple", lty="22") -} +fpath <- system.file( + "extdata", + "single_hmd_spain.csv.gz", + package = "ODAPbackend" +) + +data_in <- read_csv(fpath, col_select = c(-1), show_col_types = FALSE) +data_in$nMx <- data_in$Deaths / data_in$Exposures +data_in <- data_in[data_in$.id == 1, ] +names(data_in) <- c(".id", "Time", "Age", "Sex", + "Deaths", "Exposures", "nMx") + +z <- smooth1d( + data_in = data_in, + units = "rate", + X = "Age", + Y = "nMx", + method = "supsmu", + scale = "log" +) + +z$result +z$plot + +# Example 2: Time smoothing of counts +df <- data.frame( + Time = seq(1950, 2015, by = 5), + Deaths = c(403564.9, 426012.0, 466215.9, 523753.8, + 560874.1, 545608.3, 555335.9, 594584.6, + 608425.3, 638167.3, 655438.6, 689386.4, + 740519.0, 804439.3) +) + +z <- smooth1d( + data_in = df, + units = "count", + X = "Time", + Y = "Deaths", + method = "supsmu", + scale = "none" +) + +z$result +z$plot } diff --git a/man/smooth1d_chunk.Rd b/man/smooth1d_chunk.Rd deleted file mode 100644 index 0028484..0000000 --- a/man/smooth1d_chunk.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/smoothing1d.R -\name{smooth1d_chunk} -\alias{smooth1d_chunk} -\title{\code{smooth1d_chunk}} -\usage{ -smooth1d_chunk( - data_in., - method = c("supsmu", "lowess", "loess", "cubicspline", "gam-tp", "gam-ps")[1], - smoothing_par = 1, - xout = data_in.[["x"]], - xname = "x", - yname = "y" -) -} -\arguments{ -\item{data_in.}{\code{data.frame} with x and y coordinates to fit to. Optionally with \code{weights}. Should refer to a single subset of data.} - -\item{method}{character. Smoothing method desired. 
options \code{"supsmu"} (default),\code{"lowess"},\code{"loess"},\code{"cubicspline"},\code{"gam-tp"},\code{"gam-ps"}} - -\item{smoothing_par}{smoothing parameter, interpretation varies by method, but higher always means smoother.Default 1} - -\item{xout}{numeric vector of coordinates to predict for. Defaults to original unique coordinates.} - -\item{xname}{name of variable containing x coordinate, default \code{x}} - -\item{yname}{name of variable containing y coordinate, default \code{y}} -} -\description{ -Smooth a univariate time series, optionally using \code{weights.} Choose between the super-smoother (\code{"supsmu"}) method, loess (\code{"lowess"} or \code{"loess"}) , smoothing splines (\code{"cubicsplines"}), thin-plate splines (\code{"gam-tp"}), or p-splines (\code{"gam-ps"}). Input data may have multiple observations per x coordinate. Output can be for arbitrary x-coordinates (\code{xout}). -} -\details{ -\code{"supsmu"} method takes a \code{smoothing_par} between 0 and 10. \code{"lowess"} and -} diff --git a/man/smooth_flexible.Rd b/man/smooth_flexible.Rd index e5fde45..fc17fce 100644 --- a/man/smooth_flexible.Rd +++ b/man/smooth_flexible.Rd @@ -54,7 +54,7 @@ fpath <- system.file("extdata", "single_hmd_spain.csv.gz", package = "ODAPbackend") data_in <- read_csv(fpath) |> - dplyr::select(-1) + select(-1) ex1 <- smooth_flexible( data_in, variable = "Exposures", diff --git a/man/smooth_flexible_chunk.Rd b/man/smooth_flexible_chunk.Rd index 86f4adf..440b363 100644 --- a/man/smooth_flexible_chunk.Rd +++ b/man/smooth_flexible_chunk.Rd @@ -51,7 +51,7 @@ fpath <- system.file("extdata", "abridged_hmd_spain.csv.gz", package = "ODAPbackend") data_in <- read_csv(fpath) |> - dplyr::select(-1) |> + select(-1) |> filter(.id == 1) ex1 <- smooth_flexible_chunk(