#Data Generation Here we generate data where residuals only come from random noise and thus shouldn’t be identified as biased by IAT Instead of creating a whole dataset with some random variables and some variables correlated with others and then assign genders randomly to each observation, we want to acknowledge the fact that different genders in real world do have different strengths. ##strength
set.seed(123)
f_e=data.frame(strength=rnorm(256,62,6),label='female')
m_e=data.frame(strength=rnorm(544,68,10),label='male') #males have higher strength on average
Female employees have average of 62 for strength; males have 68. More males than females. *Note: householdIncome treated as a random variable since it’s what families the potential employees were born into. The race and edu variables will be generated based on householdIncome to represent correlations. ##Background variables
set.seed(123)
f_e$householdIncome=rnorm(256,80000,15000)
f_e$age=rdunif(256,b=55,a=18)
m_e$householdIncome=rnorm(544,80000,10000)
m_e$age=rdunif(544,b=60,a=16)
set.seed(123)
fedu_u=rdunif(256,b=2,a=-2)
f_e$edu=rpois(256,lambda=f_e$householdIncome*0.0001+fedu_u) #poisson dist. since discrete
medu_u=rdunif(544,b=2,a=-2)
m_e$edu=rpois(544,lambda=m_e$householdIncome*0.00008+medu_u)
set.seed(123)
frace_u=rdunif(256,b=3,a=-3)
flambda=f_e$householdIncome*0.000003+frace_u
for(i in 1:length(flambda)){
if(flambda[i]<0){
flambda[i]=0
}
}
f_e$race=rpois(256,lambda=flambda) #poisson distribution since discrete
mrace_u=rdunif(544,b=3,a=-3)
mlambda=m_e$householdIncome*0.000005+mrace_u
for(i in 1:length(mlambda)){
if(mlambda[i]<0){
mlambda[i]=0
}
}
m_e$race=rpois(544,lambda=mlambda)
##proxy: height Unbiased: females and males have the same functions for height, so no “direct” relationships between them.
set.seed(123)
f_u=rnorm(256,0,4)
for (i in 1:nrow(f_e)){
f_e$height[i]=45+f_e$strength[i]^2*0.002+f_e$age[i]*0.12+f_e$race[i]^(0.5)*0.001+f_u[i]
}
m_u=rnorm(544,0,4)
for (i in 1:nrow(m_e)){
m_e$height[i]=45+m_e$strength[i]^2*0.002+m_e$age[i]*0.12+m_e$race[i]^(0.5)*0.001+m_u[i]
}
employee=rbind(f_e,m_e)
#IAT Expected: although the true _1 is zero, we expect gender to be significant, due to a failure to control for other proxies through which gender correlates with height.
cut_employee = employee %>% mutate(g_binary = if_else(label == 'female',1,0))
first_step=lm(height~strength, data=cut_employee)
summary(first_step)
Call:
lm(formula = height ~ strength, data = cut_employee)
Residuals:
Min 1Q Median 3Q Max
-3.4581 -1.5845 -0.0357 1.3564 7.1469
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.579618 0.495951 29.40 <2e-16 ***
strength 0.662335 0.007419 89.28 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.953 on 798 degrees of freedom
Multiple R-squared: 0.909, Adjusted R-squared: 0.9089
F-statistic: 7970 on 1 and 798 DF, p-value: < 2.2e-16
cut_employee$resid=first_step$residuals
second_step=lm(resid~g_binary, data=cut_employee)
summary(second_step)
Call:
lm(formula = resid ~ g_binary, data = cut_employee)
Residuals:
Min 1Q Median 3Q Max
-4.6969 -1.4448 0.0148 1.3200 5.7575
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.65383 0.07308 -8.947 <2e-16 ***
g_binary 2.04320 0.12918 15.816 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.704 on 798 degrees of freedom
Multiple R-squared: 0.2387, Adjusted R-squared: 0.2377
F-statistic: 250.2 on 1 and 798 DF, p-value: < 2.2e-16
Indeed, the coefficient between gender and height is deemed as significant by the IAT.
#MLR Expected: g_binary is insignificant since true _1 is zero.
mlr_employee = employee %>% mutate(g_binary = if_else(label == 'female',1,0)) %>% select(-label)
mlr=lm(height~., data=mlr_employee)
summary(mlr)
Call:
lm(formula = height ~ ., data = mlr_employee)
Residuals:
Min 1Q Median 3Q Max
-1.44492 -0.39415 -0.04959 0.34889 2.91220
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.123e+00 2.095e-01 19.677 <2e-16 ***
strength 6.875e-01 2.455e-03 279.980 <2e-16 ***
householdIncome 4.367e-05 1.960e-06 22.278 <2e-16 ***
age 1.200e-01 1.693e-03 70.853 <2e-16 ***
edu 1.067e-02 6.908e-03 1.545 0.123
race 6.688e-03 1.315e-02 0.509 0.611
g_binary 2.418e+00 4.869e-02 49.661 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5946 on 793 degrees of freedom
Multiple R-squared: 0.9916, Adjusted R-squared: 0.9916
F-statistic: 1.564e+04 on 6 and 793 DF, p-value: < 2.2e-16
avPlots(mlr)
#GLASSO
gl_employee=mlr_employee
mat=as.matrix(gl_employee) #nxp matrix
mat=scale(mat)
varCov=var(mat) #variance-covariance matrix (non-sparse)
lambdas = seq(0.05,0.33,0.025)
lambdass = c(0.001, lambdas,1000)
for (i in 1:length(lambdass)){
glasso_m=glasso(varCov, rho = lambdass[i])
rownames(glasso_m$wi)=colnames(gl_employee)
colnames(glasso_m$wi)=colnames(gl_employee)
gl=graph_from_adjacency_matrix(glasso_m$wi, weighted=TRUE, mode="upper", diag=FALSE,add.rownames="code")
set.seed(123)
coords <-layout.reingold.tilford(gl)
plot(gl, layout = coords)
}
lambdass
[1] 0.001 0.050 0.075 0.100 0.125 0.150 0.175 0.200
[9] 0.225 0.250 0.275 0.300 0.325 1000.000