#Data Generation ##strength
set.seed(123)
f_e=data.frame(strength=rnorm(256,62,6),label='female')
m_e=data.frame(strength=rnorm(544,68,10),label='male') #males have higher strength on average
##Background variables
set.seed(123)
f_e$householdIncome=rnorm(256,80000,15000)
f_e$age=rdunif(256,b=55,a=18)
m_e$householdIncome=rnorm(544,80000,10000)
m_e$age=rdunif(544,b=60,a=16)
set.seed(123)
fedu_u=rdunif(256,b=2,a=-2)
f_e$edu=rpois(256,lambda=f_e$householdIncome*0.0001+fedu_u) #poisson dist. since discrete
medu_u=rdunif(544,b=2,a=-2)
m_e$edu=rpois(544,lambda=m_e$householdIncome*0.00008+medu_u)
7 = college degree females: compared to male group, more around or above college due to stricter self-selection affected by female mentality males: mostly around high school due to looser self-selection affected by male mentality error terms characterize employees’ self-made merit
set.seed(123)
frace_u=rdunif(256,b=3,a=-3)
flambda=f_e$householdIncome*0.000003+frace_u
for(i in 1:length(flambda)){
if(flambda[i]<0){
flambda[i]=0
}
}
f_e$race=rpois(256,lambda=flambda) #poisson distribution since discrete
mrace_u=rdunif(544,b=3,a=-3)
mlambda=m_e$householdIncome*0.000005+mrace_u
for(i in 1:length(mlambda)){
if(mlambda[i]<0){
mlambda[i]=0
}
}
m_e$race=rpois(544,lambda=mlambda)
1: white, 2: black, 3: Latinx, 4: Asian Mostly white in both groups. ##proxy: height By common sense, height is correlated with strength, age, race. Biased: females and males have different functions for height.
set.seed(123)
f_u=rnorm(256,0,4)
for (i in 1:nrow(f_e)){
f_e$height[i]=45+f_e$strength[i]^2*0.002+f_e$age[i]*0.12+f_e$race[i]^(0.5)*0.001+f_u[i]
}
m_u=rnorm(544,0,4)
for (i in 1:nrow(m_e)){
m_e$height[i]=50+m_e$strength[i]^2*0.003+m_e$age[i]*0.11+m_e$race[i]^(0.5)*0.001+m_u[i]
}
employee=rbind(f_e,m_e)
#IAT Expected: g_binary in second step is significant since true _1 is not zero.
cut_employee = employee %>% mutate(g_binary = if_else(label == 'female',1,0))
first_step=lm(height~strength, data=cut_employee)
summary(first_step)
Call:
lm(formula = height ~ strength, data = cut_employee)
Residuals:
Min 1Q Median 3Q Max
-6.1042 -2.5862 0.4193 2.5013 8.2949
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.21384 0.81208 5.189 2.68e-07 ***
strength 0.91522 0.01215 75.339 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.199 on 798 degrees of freedom
Multiple R-squared: 0.8767, Adjusted R-squared: 0.8766
F-statistic: 5676 on 1 and 798 DF, p-value: < 2.2e-16
cut_employee$resid=first_step$residuals
second_step=lm(resid~g_binary, data=cut_employee)
summary(second_step)
Call:
lm(formula = resid ~ g_binary, data = cut_employee)
Residuals:
Min 1Q Median 3Q Max
-3.6131 -1.3449 -0.1628 1.2295 6.4535
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.84134 0.07436 24.76 <2e-16 ***
g_binary -5.75418 0.13145 -43.78 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.734 on 798 degrees of freedom
Multiple R-squared: 0.706, Adjusted R-squared: 0.7056
F-statistic: 1916 on 1 and 798 DF, p-value: < 2.2e-16
_0 is 0.91522, and _1 is -5.75418 (will be compared to MLR later).
#MLR Expected: different _0, _1; coefficient for g_binary is significant
mlr_employee = employee %>% mutate(g_binary = if_else(label == 'female',1,0)) %>% select(-label)
mlr=lm(height~., data=mlr_employee)
summary(mlr)
Call:
lm(formula = height ~ ., data = mlr_employee)
Residuals:
Min 1Q Median 3Q Max
-0.89707 -0.27005 -0.07324 0.14183 2.16125
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.094e+00 1.484e-01 47.819 <2e-16 ***
strength 8.124e-01 1.738e-03 467.289 <2e-16 ***
householdIncome 2.068e-05 1.388e-06 14.897 <2e-16 ***
age 1.137e-01 1.199e-03 94.880 <2e-16 ***
edu 6.245e-03 4.891e-03 1.277 0.202
race -1.317e-03 9.309e-03 -0.141 0.888
g_binary -6.188e+00 3.447e-02 -179.513 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.421 on 793 degrees of freedom
Multiple R-squared: 0.9979, Adjusted R-squared: 0.9979
F-statistic: 6.214e+04 on 6 and 793 DF, p-value: < 2.2e-16
_0 is 0.8124, and _1 is -6.188.
#GLASSO
gl_employee=mlr_employee
mat=as.matrix(gl_employee) #nxp matrix
mat=scale(mat)
varCov=var(mat) #variance-covariance matrix (non-sparse)
lambdas = seq(0.05,0.33,0.025)
lambdass = c(0.001, lambdas,1000)
for (i in 1:length(lambdass)){
glasso_m=glasso(varCov, rho = lambdass[i])
rownames(glasso_m$wi)=colnames(gl_employee)
colnames(glasso_m$wi)=colnames(gl_employee)
gl=graph_from_adjacency_matrix(glasso_m$wi, weighted=TRUE, mode="upper", diag=FALSE,add.rownames="code")
set.seed(123)
coords <-layout.reingold.tilford(gl)
plot(gl, layout = coords)
}
lambdass
[1] 0.001 0.050 0.075 0.100 0.125 0.150 0.175 0.200
[9] 0.225 0.250 0.275 0.300 0.325 1000.000