diff --git a/DESCRIPTION b/DESCRIPTION index 38ce819..4508342 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,6 +29,7 @@ Collate: 'utils.R' 'prevalence-survey.R' 'prevalence-simple.R' + 'prevalence-hybrid-sw.R' 'assertions.R' 'prevalence.R' 'z-score-helper.R' diff --git a/NAMESPACE b/NAMESPACE index d004407..a35992a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,13 +1,18 @@ # Generated by roxygen2: do not edit by hand +S3method(column_names,hybrid_design) S3method(column_names,simple_design) S3method(column_names,survey_design) +S3method(column_values,hybrid_design) S3method(column_values,simple_design) S3method(column_values,survey_design) +S3method(compute_prevalence_estimates_for_column_by,hybrid_design) S3method(compute_prevalence_estimates_for_column_by,simple_design) S3method(compute_prevalence_estimates_for_column_by,survey_design) +S3method(compute_prevalence_sample_size_by,hybrid_design) S3method(compute_prevalence_sample_size_by,simple_design) S3method(compute_prevalence_sample_size_by,survey_design) +S3method(compute_prevalence_zscore_summaries_by,hybrid_design) S3method(compute_prevalence_zscore_summaries_by,simple_design) S3method(compute_prevalence_zscore_summaries_by,survey_design) export(anthro_api_compute_prevalence) diff --git a/NEWS.md b/NEWS.md index 4f8fc54..575f698 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,8 +14,10 @@ ## Performance -* For inputs with `cluster/strata = NULL` and `sw = NULL or 1` a faster - method was added for this special case to compute `anthro_prevalence`. +* For inputs with `cluster/strata = NULL` a faster + method was added for this special case to compute `anthro_prevalence`. This + method yields the same results within an observed tolerance of `< 0.0001` on + test data. ## Bugfix diff --git a/R/prevalence-hybrid-sw.R b/R/prevalence-hybrid-sw.R new file mode 100644 index 0000000..f1ea439 --- /dev/null +++ b/R/prevalence-hybrid-sw.R @@ -0,0 +1,105 @@ +#' The following is a hybrid backed for cases with a sampling weight > 0. +#' It recreates the estimates for z-score summaries and population estimates, +#' but uses the survey package to compute estimates around the 0/1 indicators. +#' +#' The z-score estimates follow the method uses in the {survey} package by +#' T. Lumley. +#' +#' Reference: +#' T. Lumley (2024) "survey: analysis of complex survey samples". +#' R package version 4.4. +#' @noRd + +#' @export +column_names.hybrid_design <- function(x) { + colnames(x$data) +} + +#' @export +column_values.hybrid_design <- function(x, col) { + x$data[[col]] +} + +#' @export +#' @include prevalence-simple.R +compute_prevalence_zscore_summaries_by.hybrid_design <- function( + data, + indicator, + subset_col_name +) { + zscore_col_name <- prev_zscore_value_column(indicator) + compute_and_aggregate( + data, + zscore_col_name, + subset_col_name, + compute = zscore_estimate_weighted, + empty_data_prototype = data.frame( + r = NA_real_, + se = NA_real_, + ll = NA_real_, + ul = NA_real_, + stdev = NA_real_ + ) + ) +} + +#' @export +compute_prevalence_estimates_for_column_by.hybrid_design <- compute_prevalence_estimates_for_column_by.survey_design + +#' @export +#' @include prevalence-simple.R +compute_prevalence_sample_size_by.hybrid_design <- function( + data, + indicator, + subset_col_name +) { + column_name <- prev_prevalence_column_name(indicator) + compute_and_aggregate( + data, + column_name, + subset_col_name, + compute = sample_size_weighted, + empty_data_prototype = data.frame( + pop = 0, + unwpop = 0 + ) + ) +} + +#' @importFrom stats sd plogis qt +zscore_estimate_weighted <- function(x, N, weights, empty_data_prototype) { + non_na <- !is.na(x) + x <- x[non_na] + if (length(x) == 0) { + return(empty_data_prototype) + } + w <- weights[non_na] + n <- length(x) + sw <- sum(w) + r <- sum(w * x) / sw + w_normalized <- w / sw + stdev <- sqrt(sum(w_normalized * (x - r)^2) * n / (n - 1)) + se <- sample_se_weights(x, w, r, sw, N) + t_value <- qt(1 - prevalence_significance_level / 2, N - 1) + cis <- r + se * c(-t_value, t_value) + data.frame( + r = r, + se = se, + ll = cis[1L], + ul = cis[2L], + stdev = stdev + ) +} + +sample_size_weighted <- function(x, N, weights, empty_data_prototype) { + x <- as.numeric(!is.na(x)) + pop <- as.numeric(sum(x * weights, na.rm = TRUE)) + n <- as.numeric(sum(x)) + data.frame(pop = pop, unwpop = n) +} + +sample_se_weights <- function(x, weights, x_mean, n, N) { + scale <- N / (N - 1) + x_deviation <- weights * (x - x_mean) + sqrt(scale) / n * sqrt(sum(x_deviation * x_deviation)) +} diff --git a/R/prevalence-simple.R b/R/prevalence-simple.R index 8767e34..4e3439a 100644 --- a/R/prevalence-simple.R +++ b/R/prevalence-simple.R @@ -93,6 +93,8 @@ compute_and_aggregate <- function( ) { col_values <- column_values(data, value_column) N <- length(col_values) + weights <- data$data[["sampling_weights"]] + has_weights <- !is.null(weights) grouping <- column_values(data, subset_column) groups <- if (is.factor(grouping)) { levels(grouping) @@ -103,11 +105,21 @@ compute_and_aggregate <- function( values <- vector(mode = "list", length(groups)) names(values) <- groups for (group in groups) { - values[[group]] <- compute( - col_values[grouping == group], - N = N, - empty_data_prototype = empty_data_prototype - ) + values[[group]] <- if (has_weights) { + subroup <- grouping == group + compute( + col_values[subroup], + N = N, + weights = weights[subroup], + empty_data_prototype = empty_data_prototype + ) + } else { + compute( + col_values[grouping == group], + N = N, + empty_data_prototype = empty_data_prototype + ) + } } Group <- names(values) data <- do.call(rbind, values) diff --git a/R/prevalence.R b/R/prevalence.R index ef910f4..61b2e98 100644 --- a/R/prevalence.R +++ b/R/prevalence.R @@ -165,6 +165,7 @@ #' @include anthro-package.R #' @include assertions.R #' @include prevalence-simple.R +#' @include prevalence-hybrid-sw.R #' @include prevalence-survey.R #' @importFrom stats confint #' @importFrom stats as.formula @@ -562,6 +563,12 @@ generate_cutoffs <- function(data_column) { }) } +#' build_design constructs a custom design object that is used to compute +#' all indicators. +#' The function decides if the survey package is used for all indicator +#' or if a specialized method can be used to compute the results faster. For +#' example when there are no cluster/strata and sampling weights. +#' @noRd build_design <- function(survey_data) { # we can override the choice here using internal options. # these options are not part of the public api and can change at any time. @@ -575,6 +582,11 @@ build_design <- function(survey_data) { if (is_simple_problem) { return(build_simple_design(survey_data)) } + is_simple_problem_with_sw <- is.null(survey_data[["cluster"]]) && + is.null(survey_data[["strata"]]) + if (is_simple_problem_with_sw) { + return(build_hybrid_design(survey_data)) + } build_survey_design(survey_data) } @@ -910,6 +922,25 @@ build_simple_design <- function(survey_data) { ) } +build_hybrid_design <- function(survey_data) { + # for cases with sw > 0 but cluster/strata = NULL, we just need + # a single survey design object + design <- svydesign( + ids = ~1, + strata = NULL, + weights = ~sampling_weights, + data = survey_data, + nest = TRUE + ) + structure( + list( + data = survey_data, + design = design + ), + class = "hybrid_design" + ) +} + #' @importFrom survey svydesign build_survey_design <- function(survey_data) { has_cluster_info <- !is.null(survey_data[["cluster"]]) diff --git a/README.md b/README.md index a8753c7..bb4b3e6 100644 --- a/README.md +++ b/README.md @@ -55,16 +55,16 @@ anthro_zscores( weight = c(18, 15, 10, 15), lenhei = c(120, 80, 100, 100) ) -#> clenhei cbmi cmeasure csex zlen flen zwei fwei zwfl fwfl zbmi fbmi zhc -#> 1 120 12.5000 1 7.31 1 2.20 0 -2.39 0 -3.01 0 NA -#> 2 80 23.4375 2 -3.50 0 0.95 0 4.13 0 4.66 0 NA -#> 3 100 10.0000 1 1.62 0 -2.76 0 -5.19 1 -5.61 1 NA -#> 4 100 15.0000 1 1.70 0 0.69 0 -0.29 0 -0.58 0 NA -#> fhc zac fac zts fts zss fss -#> 1 NA NA NA NA NA NA NA -#> 2 NA NA NA NA NA NA NA -#> 3 NA NA NA NA NA NA NA -#> 4 NA NA NA NA NA NA NA +#> clenhei cmeasure c9mo_flag cbmi csex zlen flen zwei fwei zwfl fwfl +#> 1 120 0 12.5000 1 7.31 1 2.20 0 -2.39 0 +#> 2 80 0 23.4375 2 -3.50 0 0.95 0 4.13 0 +#> 3 100 0 10.0000 1 1.62 0 -2.76 0 -5.19 1 +#> 4 100 0 15.0000 1 1.70 0 0.69 0 -0.29 0 +#> zbmi fbmi zhc fhc zac fac zts fts zss fss +#> 1 -3.01 0 NA NA NA NA NA NA NA NA +#> 2 4.66 0 NA NA NA NA NA NA NA NA +#> 3 -5.61 1 NA NA NA NA NA NA NA NA +#> 4 -0.58 0 NA NA NA NA NA NA NA NA ``` The returned value is a `data.frame` that can further be processed or diff --git a/tests/testthat/test-prevalence-simple-estimates.R b/tests/testthat/test-prevalence-simple-estimates.R index 248b2b2..1a53d16 100644 --- a/tests/testthat/test-prevalence-simple-estimates.R +++ b/tests/testthat/test-prevalence-simple-estimates.R @@ -1,4 +1,4 @@ -test_that("survey and approximation yield are equal within tolerance", { +test_that("survey and approximation results are equal within tolerance", { set.seed(1) n <- 200 data <- data.frame( @@ -31,9 +31,167 @@ test_that("survey and approximation yield are equal within tolerance", { } }) +test_that("survey and approximation with sw results are equal within tolerance", { + set.seed(1) + n <- 200 + data <- data.frame( + sex = sample(1:2, n, replace = TRUE), + age = sample(0:50, n, replace = TRUE), + weight = pmax(rnorm(n, 20, 10), 1), + lenhei = pmax(rnorm(n, 80, 20), 30) + ) + data$sw <- runif(nrow(data)) / nrow(data) + data$weight[43] <- NA_real_ + res_simple <- anthro_prevalence( + data$sex, + data$age, + is_age_in_month = TRUE, + weight = data$weight, + lenhei = data$lenhei, + sw = data$sw + ) + opts <- options("anthro.internal.compute_method" = "survey") + on.exit(options(opts)) + res_survey <- anthro_prevalence( + data$sex, + data$age, + is_age_in_month = TRUE, + weight = data$weight, + lenhei = data$lenhei, + sw = data$sw + ) + expect_equal(colnames(res_simple), colnames(res_survey)) + expect_equal(nrow(res_simple), nrow(res_survey)) + for (col in colnames(res_simple)) { + expect_equal(res_simple[[!!col]], res_survey[[!!col]], tolerance = 0.0001) + } +}) + +test_that("sw > 0 is approximated correctly within tolerance", { + res <- anthro_prevalence( + sex = c(1,2,1), + age = c(50, 50, NA_real_), + is_age_in_month = TRUE, + weight = 80, + lenhei = 100, + sw = 0.05 + ) + expect_true(all(!is.na(res$HAZ_pop))) +}) + +test_that("sw > 0 is approximated correctly within tolerance", { + set.seed(1) + data <- data.frame( + sex = sample(1:2, 100, replace = TRUE), + age = sample(0:50, 100, replace = TRUE), + weight = pmax(rnorm(100, 20, 10), 1), + lenhei = pmax(rnorm(100, 80, 20), 30) + ) + data$sw <- runif(nrow(data)) / nrow(data) + res <- anthro_prevalence( + data$sex, + data$age, + is_age_in_month = TRUE, + weight = data$weight, + lenhei = data$lenhei, + sw = data$sw + ) + zscores <- anthro_zscores( + data$sex, + data$age, + is_age_in_month = TRUE, + weight = data$weight, + lenhei = data$lenhei + ) + zscores <- cbind(zscores, data) + zscores$zlen <- ifelse(zscores$flen == 1, NA_real_, zscores$zlen) + design <- survey::svydesign( + ids = ~1, + data = zscores, + nest = TRUE, + weights = ~sw + ) + design_unweighted <- survey::svydesign( + ids = ~1, + data = zscores, + nest = TRUE, + weights = NULL + ) + expected <- survey::svyby( + ~zlen, + ~sex, + design, + survey::svymean, + na.rm = TRUE, + na.rm.all = TRUE, + drop.empty.groups = FALSE + ) + expected_cis <- confint( + expected, + df = degf(design), + level = 1 - prevalence_significance_level + ) + observed <- res[c(8, 9), c("Group", "HA_r", "HA_se", "HA_ll", "HA_ul")] + expect_equal( + observed$HA_r, + rev(expected$zlen) + ) + expect_equal( + observed$HA_se, + rev(expected$se) + ) + expect_equal( + observed$HA_ll, + rev(as.numeric(expected_cis[, 1])) + ) + expect_equal( + observed$HA_ul, + rev(as.numeric(expected_cis[, 2])) + ) + + observed <- res[1, c("Group", "HA_r", "HA_se")] + expected_zlen_all <- survey::svymean( + ~zlen, + design, + na.rm = TRUE, + na.rm.all = TRUE, + drop.empty.groups = FALSE + ) + expect_equal( + as.numeric(observed[2:3]), + c(as.numeric(expected_zlen_all), as.numeric(survey::SE(expected_zlen_all))) + ) + expected_total_weighted <- survey::svytotal( + ~ I(!is.na(zlen)), + design, + na.rm = TRUE, + na.rm.all = TRUE, + drop.empty.groups = FALSE + ) + expected_total_unweighted <- survey::svytotal( + ~ I(!is.na(zlen)), + design_unweighted, + na.rm = TRUE, + na.rm.all = TRUE, + drop.empty.groups = FALSE + ) + observed <- res[1, 2:3, drop = TRUE] + expect_equal( + observed$HAZ_pop, + as.numeric(expected_total_weighted[2]) + ) + expect_equal( + observed$HAZ_unwpop, + as.numeric(expected_total_unweighted[2]) + ) +}) + test_that("zscore estimates should not overflow", { x <- rnorm(100000) expect_no_warning( zscore_estimate(x, length(x), data.frame()) ) + expect_no_warning( + zscore_estimate_weighted(x, length(x), rep.int(1, length(x)), data.frame()) + ) })