3.3 Area models - ArcSin transformation.

In its most basic conception, the Fay-Herriot model is a linear combination of covariates. However, the result of this combination can take values that fall outside the acceptable range for a proportion; that is, generally, the Fay-Herriot estimator \(\theta \in \mathbb{R}\), whereas the direct estimator \(\theta \in (0,1)\). The arcsine transformation is given by:

\[ \hat{z}_d = \arcsin\left( \sqrt{ \hat{\theta}_d} \right) \]

where

\[ Var\left( \hat{z}_d \right) = \frac{\widehat{DEFF}_d}{4\times n_d} = \frac{1}{4\times n_{d, \text{effective}} } \]

The Fay-Herriot model is defined as follows:

\[ \begin{align*} Z_d \mid \mu_d,\sigma^2_d & \sim N(\mu_d, \sigma^2_d)\\ \mu_d & = \boldsymbol{x}^{T}_{d}\boldsymbol{\beta} + u_d \\ \theta_d & = \left(\sin(\mu_d)\right)^2 \end{align*} \]

where \(u_d \sim N(0 , \sigma^2)\).

Let the prior distributions for \(\boldsymbol{\beta}\) and \(\sigma_{u}^{2}\) be given by:

\[ \begin{align*} \boldsymbol{\beta} \sim N\left(0,1000 \right)\\ \sigma_{u}^{2} \sim \text{IG}\left(0.0001,0.0001\right) \end{align*} \]

3.3.1 Estimation procedure

Reading the database that resulted in the previous step and selecting the columns of interest

library(tidyverse)
library(magrittr)

base_FH <- readRDS('Recursos/04_FH_Arcosin/01_base_FH.Rds') %>% 
transmute(dam2,                            ## id dominios
          pobreza,
          T_pobreza = asin(sqrt(pobreza)),  ## creando zd
          n_effec = n_eff_FGV,              ## n efectivo
          varhat = 1/(4*n_effec)            ## varianza para zd
)

Joining the two databases.

statelevel_predictors_df <-
  readRDS('Recursos/03_FH_normal/02_statelevel_predictors_dam.rds') %>%
  mutate(id_order = 1:n())
base_FH <-
  full_join(base_FH, statelevel_predictors_df, by = "dam2")
tba(base_FH[, 1:8] %>% head(10))
dam2 pobreza T_pobreza n_effec varhat area1 sex2 age2
0101 NA NA NA NA 1.0000 0.5087 0.2694
0102 0.9836 1.4422 1029.570 2.000000e-04 1.0000 0.4754 0.2857
0103 1.0000 1.5708 0.000 2.226882e+57 1.0000 0.5037 0.3095
0201 NA NA NA NA 0.5147 0.5060 0.2962
0202 0.9391 1.3215 5881.354 0.000000e+00 0.9986 0.5376 0.2625
0203 0.8117 1.1219 6965.029 0.000000e+00 0.9754 0.5432 0.2454
0204 NA NA NA NA 1.0000 0.5300 0.3151
0205 0.9646 1.3814 11790.358 0.000000e+00 1.0000 0.5182 0.3057
0206 NA NA NA NA 1.0000 0.5157 0.3192
0207 NA NA NA NA 1.0000 0.5097 0.3099

Selecting the covariates for the model.

names_cov <-
    c(
       "ODDJOB","WORKED",
      "stable_lights_mean",
      "accessibility_mean",
      "urban.coverfraction_sum"
    )

3.3.2 Preparing Inputs for STAN

  1. Splitting the database into observed and unobserved domains

Observed domains.

data_dir <- base_FH %>% filter(!is.na(T_pobreza))

Unobserved domains.

data_syn <-
  base_FH %>% anti_join(data_dir %>% select(dam2))
  1. Defining the fixed-effects matrix.
## Observed domains
Xdat <- cbind(inter = 1,data_dir[,names_cov])

## Unobserved domains
Xs <-  cbind(inter = 1,data_syn[,names_cov])
  1. Creating a parameter list for STAN.
sample_data <- list(
  N1 = nrow(Xdat),       # Observed.
  N2 = nrow(Xs),         # Unobserved.
  p  = ncol(Xdat),       # Number of regressors.
  X  = as.matrix(Xdat),  # Observed Covariates.
  Xs = as.matrix(Xs),    # Unobserved Covariates
  y  = as.numeric(data_dir$T_pobreza),
  sigma_e = sqrt(data_dir$varhat)
)
  1. Compiling the model in STAN.
library(rstan)
fit_FH_arcoseno <- "Recursos/04_FH_Arcosin/modelosStan/15FH_arcsin_normal.stan"
options(mc.cores = parallel::detectCores())
rstan::rstan_options(auto_write = TRUE) # speed up running time 
model_FH_arcoseno <- stan(
  file = fit_FH_arcoseno,  
  data = sample_data,   
  verbose = FALSE,
  warmup = 2500,         
  iter = 3000,            
  cores = 4              
)
saveRDS(model_FH_arcoseno,
        "Recursos/04_FH_Arcosin/02_model_FH_arcoseno.rds")
model_FH_arcoseno <- readRDS("Recursos/04_FH_Arcosin/02_model_FH_arcoseno.rds")

3.3.2.1 Model results for the observed domains.

library(bayesplot)
library(patchwork)
library(posterior)

y_pred_B <- as.array(model_FH_arcoseno, pars = "theta") %>% 
  as_draws_matrix()
rowsrandom <- sample(nrow(y_pred_B), 100)

y_pred2 <- y_pred_B[rowsrandom, ]
ppc_dens_overlay(y = as.numeric(data_dir$pobreza), y_pred2)

Graphical analysis of the convergence of \(\sigma^2_u\) chains.

posterior_sigma2_u <- as.array(model_FH_arcoseno, pars = "sigma2_u")
(mcmc_dens_chains(posterior_sigma2_u) +
    mcmc_areas(posterior_sigma2_u) ) / 
  mcmc_trace(posterior_sigma2_u)

To validate the convergence of all chains, the R-hat is used.

parametros <- summary(model_FH_arcoseno, pars =  c("theta", "theta_pred") )$summary %>%
  data.frame()
p1 <- mcmc_rhat(parametros$Rhat)
p1

Estimation of the FH of poverty in the observed domains.

theta_FH <-   summary(model_FH_arcoseno,pars =  "theta")$summary %>%
  data.frame()
data_dir %<>% mutate(pred_arcoseno = theta_FH$mean, 
                     pred_arcoseno_EE = theta_FH$sd,
                     Cv_pred = pred_arcoseno_EE/pred_arcoseno)

Estimation of the FH of poverty in the NOT observed domains.

theta_FH_pred <- summary(model_FH_arcoseno,pars =  "theta_pred")$summary %>%
  data.frame()
data_syn <- data_syn %>% 
  mutate(pred_arcoseno = theta_FH_pred$mean,
         pred_arcoseno_EE = theta_FH_pred$sd,
         Cv_pred = pred_arcoseno_EE/pred_arcoseno)
dam2 pobreza pred_arcoseno pred_arcoseno_EE Cv_pred
0101 NA 0.8630 0.1463 0.1696
0201 NA 0.8797 0.1426 0.1621
0204 NA 0.8412 0.1592 0.1893
0206 NA 0.8980 0.1296 0.1443
0207 NA 0.8958 0.1321 0.1475
0208 NA 0.7818 0.1893 0.2421
0209 NA 0.8910 0.1313 0.1474
0210 NA 0.8990 0.1290 0.1435
0211 NA 0.8872 0.1389 0.1566
0502 NA 0.7649 0.1889 0.2470

consolidating the bases of estimates for observed and UNobserved domains.

estimacionesPre <- bind_rows(data_dir, data_syn) %>% 
  select(dam2, theta_pred = pred_arcoseno) %>% 
  mutate(dam = substr(dam2,1,2))