3.4 Benchmark Process

  1. From the census extract the total number of people by DAM2
total_pp <- readRDS(file = "Recursos/04_FH_Arcosin/06_censo_mrp.rds") %>% 
   mutate(dam = substr(dam2,1,2))


N_dam_pp <- total_pp %>%   group_by(dam,dam2) %>%  
            summarise(total_pp = sum(n) ) %>% 
  group_by(dam) %>% mutate(dam_pp = sum(total_pp))

tba(N_dam_pp %>% data.frame() %>% slice(1:10))
dam dam2 total_pp dam_pp
01 0101 35750 88799
01 0102 26458 88799
01 0103 26591 88799
02 0201 58937 573065
02 0202 43700 573065
02 0203 35309 573065
02 0204 47230 573065
02 0205 35694 573065
02 0206 38832 573065
02 0207 42103 573065
  1. Obtaining direct estimates by DAM or the level of aggregation at which the survey is representative.

In this code, an RDS file of a survey (07_data_JAM.rds) is read, and the transmute() function is used to select and transform the variables of interest.

encuesta <- readRDS("Recursos/04_FH_Arcosin/07_encuesta.rds") %>% 
   mutate(dam = substr(dam2,1,2))

The code is conducting survey data analysis using the survey package in R. Initially, an object design is created as a survey design using the as_survey_design() function from the srvyr package. This design includes primary sampling unit identifiers (upm), weights (fep), strata (estrato), and survey data (encuesta). Subsequently, the design object is grouped by the variable “Aggregate,” and the mean of the variable “pobreza” with a confidence interval for the entire population is calculated using the survey_mean() function. The result is stored in the directoDam object and displayed in a table.

library(survey)
library(srvyr)
options(survey.lonely.psu = "adjust")

diseno <-
  as_survey_design(
    ids = upm,
    weights = fep,
    strata = estrato,
    nest = TRUE,
    .data = encuesta
  )
directoDam <- diseno %>% 
    group_by(dam) %>% 
  summarise(
    theta_dir = survey_mean(pobreza, vartype = c("ci"))
    )
tba(directoDam %>% slice(1:10))
dam theta_dir theta_dir_low theta_dir_upp
01 0.9926 0.9848 1.0004
02 0.9760 0.9638 0.9881
03 0.9191 0.8566 0.9816
04 0.8160 0.7742 0.8578
05 0.7486 0.6943 0.8029
06 0.9419 0.8993 0.9844
07 0.8019 0.7076 0.8962
08 0.5933 0.4904 0.6961
09 0.8628 0.8167 0.9090
10 0.7379 0.6348 0.8410
  1. Carry out the consolidation of information obtained in 1 and 2.
temp <- estimacionesPre %>%
  inner_join(N_dam_pp ) %>% 
  inner_join(directoDam )

tba(temp %>% slice(1:10))
dam2 theta_pred dam total_pp dam_pp theta_dir theta_dir_low theta_dir_upp
0102 0.9832 01 26458 88799 0.9926 0.9848 1.0004
0103 0.8759 01 26591 88799 0.9926 0.9848 1.0004
0202 0.9391 02 43700 573065 0.9760 0.9638 0.9881
0203 0.8117 02 35309 573065 0.9760 0.9638 0.9881
0205 0.9645 02 35694 573065 0.9760 0.9638 0.9881
0212 0.8303 02 58509 573065 0.9760 0.9638 0.9881
0301 0.9419 03 41762 93896 0.9191 0.8566 0.9816
0302 0.9109 03 52134 93896 0.9191 0.8566 0.9816
0401 0.8079 04 49909 81732 0.8160 0.7742 0.8578
0402 0.8307 04 31823 81732 0.8160 0.7742 0.8578
  1. With the organized information, calculate the weights for the Benchmark
R_dam2 <- temp %>% group_by(dam) %>% 
  summarise(
  R_dam_RB = unique(theta_dir) / sum((total_pp  / dam_pp) * theta_pred)
) %>%
  left_join(directoDam, by = "dam")

tba(R_dam2 %>% arrange(desc(R_dam_RB)))
dam R_dam_RB theta_dir theta_dir_low theta_dir_upp
13 1.1793 0.6461 0.5226 0.7696
02 1.1158 0.9760 0.9638 0.9881
01 1.0996 0.9926 0.9848 1.0004
09 1.0669 0.8628 0.8167 0.9090
14 1.0550 0.8792 0.8376 0.9207
10 1.0465 0.7379 0.6348 0.8410
05 1.0098 0.7486 0.6943 0.8029
06 1.0031 0.9419 0.8993 0.9844
04 0.9991 0.8160 0.7742 0.8578
08 0.9973 0.5933 0.4904 0.6961
03 0.9940 0.9191 0.8566 0.9816
07 0.9474 0.8019 0.7076 0.8962
11 0.8631 0.4277 0.3056 0.5498
12 0.0480 0.0292 0.0120 0.0463

calculating the weights for each domain.

pesos <- temp %>% 
  mutate(W_i = total_pp / dam_pp) %>% 
  select(dam2, W_i)
tba(pesos %>% slice(1:10))
dam2 W_i
0102 0.2980
0103 0.2995
0202 0.0763
0203 0.0616
0205 0.0623
0212 0.1021
0301 0.4448
0302 0.5552
0401 0.6106
0402 0.3894
  1. Perform FH Benchmark Estimation
estimacionesBench <- estimacionesPre %>%
  left_join(R_dam2, by = c("dam")) %>%
  mutate(theta_pred_RBench = R_dam_RB * theta_pred) %>%
  left_join(pesos) %>% 
  select(dam, dam2, W_i, theta_pred, theta_pred_RBench)  

  tba(estimacionesBench %>% slice(1:10))
dam dam2 W_i theta_pred theta_pred_RBench
01 0102 0.2980 0.9832 1.0811
01 0103 0.2995 0.8759 0.9632
02 0202 0.0763 0.9391 1.0478
02 0203 0.0616 0.8117 0.9057
02 0205 0.0623 0.9645 1.0762
02 0212 0.1021 0.8303 0.9264
03 0301 0.4448 0.9419 0.9363
03 0302 0.5552 0.9109 0.9054
04 0401 0.6106 0.8079 0.8072
04 0402 0.3894 0.8307 0.8299
  1. Validation: FH Estimation with Benchmark
estimacionesBench %>% group_by(dam) %>%
  summarise(theta_reg_RB = sum(W_i * theta_pred_RBench)) %>%
  left_join(directoDam, by = "dam") %>% 
  tba()
dam theta_reg_RB theta_dir theta_dir_low theta_dir_upp
01 0.9926 0.9926 0.9848 1.0004
02 0.9760 0.9760 0.9638 0.9881
03 0.9191 0.9191 0.8566 0.9816
04 0.8160 0.8160 0.7742 0.8578
05 0.7486 0.7486 0.6943 0.8029
06 0.9419 0.9419 0.8993 0.9844
07 0.8019 0.8019 0.7076 0.8962
08 0.5933 0.5933 0.4904 0.6961
09 0.8628 0.8628 0.8167 0.9090
10 0.7379 0.7379 0.6348 0.8410
11 0.4277 0.4277 0.3056 0.5498
12 0.0292 0.0292 0.0120 0.0463
13 0.6461 0.6461 0.5226 0.7696
14 0.8792 0.8792 0.8376 0.9207

3.4.1 Results Validation

This code conducts data analysis and visualization using the ggplot2 library. Specifically, it merges two data frames using the left_join() function, groups the data by the dam variable, and performs some operations to transform the thetaFH and theta_pred_RBench variables. Afterwards, it utilizes the gather() function to organize the data in a long format and visualizes it with ggplot().

The resulting visualization displays points in different shapes and colors, representing various estimation methods. Additionally, it includes two dashed lines that depict the upper and lower confidence intervals for the observed values in the theta_dir variable.

temp <- estimacionesBench %>% left_join(
bind_rows(
data_dir %>% select(dam2, thetaFH = pred_arcoseno),
data_syn %>% select(dam2, thetaFH = pred_arcoseno))) %>% 
group_by(dam) %>% 
summarise(thetaFH = sum(W_i * theta_pred),
          theta_RBench = sum(W_i * theta_pred_RBench)
          ) %>%   
left_join(directoDam, by = "dam")  %>% 
mutate(id = 1:n())

temp %<>% gather(key = "Metodo",value = "Estimacion",
                -id, -dam, -theta_dir_upp, -theta_dir_low)

p1 <- ggplot(data = temp, aes(x = id, y = Estimacion, shape = Metodo)) +
  geom_point(aes(color = Metodo), size = 2) +
  geom_line(aes(y = theta_dir_low), linetype  = 2) +
  geom_line(aes(y = theta_dir_upp),  linetype  = 2) +
  theme_bw(20) + 
  scale_x_continuous(breaks = temp$id,
    labels =  temp$dam) +
  labs(y = "", x = "")

# ggsave(plot = p1,
#        filename = "Recursos/04_FH_Arcosin/08_validar_bench.png",width = 16,height = 12)
p1