# Data Generation

Here we generate data where residuals come only from random noise and thus should not be identified as biased by the IAT. Instead of creating a whole dataset with some random variables, some variables correlated with others, and then assigning genders randomly to each observation, we want to acknowledge the fact that different genders in the real world do have different strengths.

## Strength

# Simulate employee strength by gender.
# Females: N(62, 6); males: N(68, 10) — males have a higher mean strength
# and a larger group size (544 vs 256) by design.
set.seed(123)
f_e <- data.frame(strength = rnorm(256, mean = 62, sd = 6), label = 'female')
m_e <- data.frame(strength = rnorm(544, mean = 68, sd = 10), label = 'male')

Female employees have an average strength of 62; males have an average of 68, and there are more males than females. *Note: householdIncome is treated as a random variable since it reflects the families the potential employees were born into. The race and edu variables will be generated from householdIncome to represent correlations.*

## Background variables

# Background covariates: householdIncome (continuous), age, edu, race.
# householdIncome is exogenous; edu and race are generated FROM
# householdIncome, so gender correlates with them only indirectly.
# NOTE(review): rdunif() is not base R (purrr/extraDistr) — assumed loaded
# elsewhere in the file; confirm which package the document attaches.
set.seed(123)
f_e$householdIncome <- rnorm(256, mean = 80000, sd = 15000)
f_e$age <- rdunif(256, b = 55, a = 18)
m_e$householdIncome <- rnorm(544, mean = 80000, sd = 10000)
m_e$age <- rdunif(544, b = 60, a = 16)

# Education: Poisson since it is discrete; rate tied to household income
# plus discrete-uniform noise.
set.seed(123)
fedu_u <- rdunif(256, b = 2, a = -2)
f_e$edu <- rpois(256, lambda = f_e$householdIncome * 0.0001 + fedu_u)
medu_u <- rdunif(544, b = 2, a = -2)
m_e$edu <- rpois(544, lambda = m_e$householdIncome * 0.00008 + medu_u)

# Race: Poisson as well. rpois() requires lambda >= 0, so clamp the rate
# at zero with a vectorized pmax() instead of a scalar loop.
set.seed(123)
frace_u <- rdunif(256, b = 3, a = -3)
flambda <- pmax(f_e$householdIncome * 0.000003 + frace_u, 0)
f_e$race <- rpois(256, lambda = flambda)
mrace_u <- rdunif(544, b = 3, a = -3)
mlambda <- pmax(m_e$householdIncome * 0.000005 + mrace_u, 0)
m_e$race <- rpois(544, lambda = mlambda)

## Proxy: height

Unbiased case: females and males share the same generating function for height, so there is no "direct" relationship between gender and height.

# Proxy variable: height. Both genders share the SAME generating function,
# so gender has no direct effect on height — any gender signal must flow
# through strength/age/race. Computed vectorized instead of row-by-row
# (same arithmetic, same values; race is a nonnegative Poisson draw, so
# sqrt(race) == race^0.5).
set.seed(123)
f_u <- rnorm(256, mean = 0, sd = 4)
f_e$height <- 45 + 0.002 * f_e$strength^2 + 0.12 * f_e$age +
  0.001 * sqrt(f_e$race) + f_u
m_u <- rnorm(544, mean = 0, sd = 4)
m_e$height <- 45 + 0.002 * m_e$strength^2 + 0.12 * m_e$age +
  0.001 * sqrt(m_e$race) + m_u
# Combine into one employee table (females first, then males).
employee <- rbind(f_e, m_e)

# IAT

Expected: although the true gender coefficient $\beta_1$ is zero, we expect gender to be deemed significant, because the IAT fails to control for the other proxies through which gender correlates with height.

# IAT step 1: regress height on the legitimate qualification (strength)
# only. g_binary codes gender as 1 = female, 0 = male.
cut_employee <- employee %>%
  mutate(g_binary = if_else(label == 'female', 1, 0))
first_step <- lm(height ~ strength, data = cut_employee)
summary(first_step)

Call:
lm(formula = height ~ strength, data = cut_employee)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.4581 -1.5845 -0.0357  1.3564  7.1469 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 14.579618   0.495951   29.40   <2e-16 ***
strength     0.662335   0.007419   89.28   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.953 on 798 degrees of freedom
Multiple R-squared:  0.909, Adjusted R-squared:  0.9089 
F-statistic:  7970 on 1 and 798 DF,  p-value: < 2.2e-16
# IAT step 2: regress the step-1 residuals on gender. A significant
# g_binary coefficient is what the IAT flags as bias.
cut_employee$resid <- residuals(first_step)  # extractor over $residuals
second_step <- lm(resid ~ g_binary, data = cut_employee)
summary(second_step)

Call:
lm(formula = resid ~ g_binary, data = cut_employee)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.6969 -1.4448  0.0148  1.3200  5.7575 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.65383    0.07308  -8.947   <2e-16 ***
g_binary     2.04320    0.12918  15.816   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.704 on 798 degrees of freedom
Multiple R-squared:  0.2387,    Adjusted R-squared:  0.2377 
F-statistic: 250.2 on 1 and 798 DF,  p-value: < 2.2e-16

Indeed, the coefficient between gender and height is deemed significant by the IAT.

# MLR

Expected: g_binary is insignificant, since the true $\beta_1$ is zero.

# MLR benchmark: regress height on ALL covariates at once. If gender
# truly has no direct effect, g_binary should be insignificant once the
# proxies (strength, income, age, edu, race) are controlled for.
mlr_employee <- employee %>%
  mutate(g_binary = if_else(label == 'female', 1, 0)) %>%
  select(-label)
mlr <- lm(height ~ ., data = mlr_employee)
summary(mlr)

Call:
lm(formula = height ~ ., data = mlr_employee)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.44492 -0.39415 -0.04959  0.34889  2.91220 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.123e+00  2.095e-01  19.677   <2e-16 ***
strength        6.875e-01  2.455e-03 279.980   <2e-16 ***
householdIncome 4.367e-05  1.960e-06  22.278   <2e-16 ***
age             1.200e-01  1.693e-03  70.853   <2e-16 ***
edu             1.067e-02  6.908e-03   1.545    0.123    
race            6.688e-03  1.315e-02   0.509    0.611    
g_binary        2.418e+00  4.869e-02  49.661   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5946 on 793 degrees of freedom
Multiple R-squared:  0.9916,    Adjusted R-squared:  0.9916 
F-statistic: 1.564e+04 on 6 and 793 DF,  p-value: < 2.2e-16
# Added-variable (partial regression) plots for each regressor — from the
# car package, assumed loaded elsewhere in the file.
avPlots(mlr)

#GLASSO

# GLASSO: estimate a sparse precision matrix over all variables for a grid
# of penalties, and plot the implied conditional-independence graph at
# each penalty. glasso() and graph_from_adjacency_matrix() come from the
# glasso and igraph packages, assumed loaded elsewhere in the file.
gl_employee <- mlr_employee
mat <- scale(as.matrix(gl_employee))  # n x p matrix, standardized columns
varCov <- var(mat)                    # after scaling this is the correlation matrix
# Penalty grid: a near-zero penalty, a moderate sequence, and a huge
# penalty (which should drive the graph to fully disconnected).
lambdas <- seq(0.05, 0.33, 0.025)
lambdass <- c(0.001, lambdas, 1000)
for (i in seq_along(lambdass)) {
  glasso_m <- glasso(varCov, rho = lambdass[i])
  # Label the estimated precision matrix (wi) so graph vertices get names.
  rownames(glasso_m$wi) <- colnames(gl_employee)
  colnames(glasso_m$wi) <- colnames(gl_employee)
  gl <- graph_from_adjacency_matrix(glasso_m$wi, weighted = TRUE, mode = "upper",
                                    diag = FALSE, add.rownames = "code")
  set.seed(123)
  coords <- layout_as_tree(gl)  # current igraph name for layout.reingold.tilford()
  plot(gl, layout = coords)
}

lambdass
 [1]    0.001    0.050    0.075    0.100    0.125    0.150    0.175    0.200
 [9]    0.225    0.250    0.275    0.300    0.325 1000.000