I dislike R, unlike some other software I use, such as SPSS and Stata. It’s extremely error prone. But it’s free, and it can do almost everything (e.g., a few things Stata cannot do and a lot of things SPSS/AMOS cannot do). As always, I will update this page whenever I learn something new.
Before going any further
I recommend downloading https://osf.io/hiuzk/. That’s the subset of the General Social Survey data I will be using for my analyses below. So, if you want to follow along, you should get it. And do not forget to run the entire syntax displayed in Create and modify variables.
Always remember this
R is case sensitive and does not handle missing data automatically: many functions fail or return NA unless you tell them what to do with NAs. The data files must be loaded before you can work with them. In Windows file paths, you must use \\ (or /) instead of \ or it will fail. The function read.csv() below, as you guessed, allows you to read a .csv file, but you must then save it into an object, whatever you call it. Let’s say “entiredata”. When importing a .csv file, you don’t need to use data.frame() because R automatically stores it as a data frame. If the file is too large, keep only the variables you need by using select() and then write.table() to save them in a new file.
setwd("c:\\Users\\MengHuTheGodOfDestruction\\location of your .csv file")
entiredata<-read.csv("GSSsubset.csv") # GSS data 1974-2012
#d<- select(entiredata, race, educ, wordsum)
#write.table(d, file = "GSSsubset2.csv", sep = ",", col.names = T)
Publish at RPubs or save the work environment as an HTML page
If you ever need to publish your work at RPubs, run the code in the RStudio “Source” panel within chunks to display all the results, then save the working environment as an HTML file using “Knit to HTML” in the “Knit” menu (you must install the knitr package). Make sure to keep the .Rmd file by using “Save As” in the “File” menu.
library(rsconnect)
accounts()
file.exists('C:\\Users\\mh198\\OneDrive\\Documents\\vocab_corrected_code.html')
rmarkdown::render('C:\\Users\\mh198\\OneDrive\\Documents\\vocab_corrected_code.Rmd')
rsconnect::rpubsUpload(
title = 'A new, open source English vocabulary test (corrected)',
contentFile = 'C:\\Users\\mh198\\OneDrive\\Documents\\vocab_corrected_code.html',
originalDoc = 'C:\\Users\\mh198\\OneDrive\\Documents\\vocab_corrected_code.Rmd'
)
This will give you an RPubs link where you need to proceed to step 2 before publishing.
Useful functions (to get your feet wet)
Sys.setenv(LANG = "en") # make R environment in english
help(table) # request the webpage help for table() function
View(d) # open the data window of the object "d"
head(d, n=10) # display the first ten rows of all variables
tail(d, n=10) # display the last ten rows of all variables
require(dplyr) # get select() function to keep the variables you wish
dk<- select(d, worda, wordb, wordc, wordd, worde, wordf, wordg, wordh, wordi, wordj, wordsum, bw1) # keep the variables you want (useful when you have too many variables)
dk = dk[complete.cases(dk),] # keep only cases with no missing data on all variables in dk
d$bw1<-NULL # drop variables individually
d<- subset(entiredata, age<70 & age>=18) # retain only cases having age<70 and age>=18
xtabs(~degree+sex+race, data=d, subset=wordsum<11) # acts like a "do if": all valid wordsum values are below 11, so this drops the missing-value codes
(56 + 31 + 56 + 8 + 32) / 5 # You get 36.6, but I usually do this kind of things in EXCEL or Kingsoft Spreadsheets
Create and modify variables
Please, run the entire syntax in your R console. Otherwise, you will not be able to work with the regression, structural equation, factor analysis, and item response models that I have prepared. Do not forget to check your directory path and make sure your file is converted into .csv.
setwd("c:\\Users\\MengHuTheGodOfDestruction\\location of your .csv file")
entiredata<-read.csv("GSSsubset.csv")
d<- subset(entiredata, age<70 & age>=18) # retain only cases having age<70 and age>=18
require(car) # get recode() function
d$bw<- rep(NA) # generate variable with only missing values
d$bw[d$race==1] <- 1 # assign a value of "1" under the condition specified in bracket
d$bw[d$race==2] <- 0 # assign a value of "0" under the condition specified in bracket
d$bw0<- rep(NA)
d$bw0[d$year>=2000 & d$race==1 & d$hispanic==1] <- 1
d$bw0[d$year>=2000 & d$race==2 & d$hispanic==1] <- 0
d$bw00<- rep(NA)
d$bw00[d$year<2000 & d$race==1] <- 1
d$bw00[d$year<2000 & d$race==2] <- 0
d$bw1<-d$bw0 # combine the two vars, starting with the first one
d$bw1[is.na(d$bw0)]<-d$bw00[is.na(d$bw0)] # then fill in from the second one where the first is NA
d$agec<- d$age-40.62 # mean-centering of age
d$agec2<- d$agec^2 # squared term of age
d$agec3<- d$agec^3 # cubic term of age
d$bw1agec = d$bw1*d$agec
d$bw1agec2 = d$bw1*d$agec2
d$bw1agec3 = d$bw1*d$agec3
d$weight<- d$wtssall*d$oversamp # combined sampling weight (product of wtssall and oversamp)
d$gender<- d$sex-1 # assuming sex has M=1 and F=2, gender will have M=0 and F=1
d$realinc[d$realinc==0] <- NA
d$logincome<- log(d$realinc) # log of income
d$sqrtincome<- sqrt(d$realinc) # sqrt of income
d$educ[d$educ>20] <- NA # remove missing values before transformation
d$degree[d$degree>4] <- NA # remove missing values before transformation
d$educc<- d$educ-13.09 # mean-centering of educ
d$degreec<- d$degree-1.39 # mean-centering of degree
d$sibs[d$sibs==-1] <- NA
d$sibs[d$sibs>37] <- NA
d$cohort[d$cohort==9999] <- NA
d$wordsum[d$wordsum==-1] <- NA
d$wordsum[d$wordsum==99] <- NA
d$worda[d$worda<0] <- NA
d$wordb[d$wordb<0] <- NA
d$wordc[d$wordc<0] <- NA
d$wordd[d$wordd<0] <- NA
d$worde[d$worde<0] <- NA
d$wordf[d$wordf<0] <- NA
d$wordg[d$wordg<0] <- NA
d$wordh[d$wordh<0] <- NA
d$wordi[d$wordi<0] <- NA
d$wordj[d$wordj<0] <- NA
d$worda[d$worda==9] <- 0
d$wordb[d$wordb==9] <- 0
d$wordc[d$wordc==9] <- 0
d$wordd[d$wordd==9] <- 0
d$worde[d$worde==9] <- 0
d$wordf[d$wordf==9] <- 0
d$wordg[d$wordg==9] <- 0
d$wordh[d$wordh==9] <- 0
d$wordi[d$wordi==9] <- 0
d$wordj[d$wordj==9] <- 0
d$word = d$worda + d$wordb + d$wordc + d$wordd + d$worde + d$wordf + d$wordg + d$wordh + d$wordi + d$wordj
d<- d %>% mutate(cohort6 = case_when(
cohort>=1905 & cohort<=1928~0,
cohort>=1929 & cohort<=1943~1,
cohort>=1944 & cohort<=1953~2,
cohort>=1954 & cohort<=1962~3,
cohort>=1963 & cohort<=1973~4,
cohort>=1974 & cohort<=1994~5,
cohort>1994~NA_real_))
d$cohortdummy1<- as.numeric(d$cohort>=1905 & d$cohort<=1928)
d$cohortdummy2<- as.numeric(d$cohort>=1929 & d$cohort<=1943)
d$cohortdummy3<- as.numeric(d$cohort>=1944 & d$cohort<=1953)
d$cohortdummy4<- as.numeric(d$cohort>=1954 & d$cohort<=1962)
d$cohortdummy5<- as.numeric(d$cohort>=1963 & d$cohort<=1973)
d$cohortdummy6<- as.numeric(d$cohort>=1974 & d$cohort<=1994)
d$bw1C1<- d$bw1*d$cohortdummy1
d$bw1C2<- d$bw1*d$cohortdummy2
d$bw1C3<- d$bw1*d$cohortdummy3
d$bw1C4<- d$bw1*d$cohortdummy4
d$bw1C5<- d$bw1*d$cohortdummy5
d$bw1C6<- d$bw1*d$cohortdummy6
d<- d %>% mutate(cohort21 = case_when(
cohort>=1905 & cohort<=1915~0,
cohort>=1916 & cohort<=1920~1,
cohort>=1921 & cohort<=1925~2,
cohort>=1926 & cohort<=1929~3,
cohort>=1930 & cohort<=1933~4,
cohort>=1934 & cohort<=1937~5,
cohort>=1938 & cohort<=1941~6,
cohort>=1942 & cohort<=1944~7,
cohort>=1945 & cohort<=1947~8,
cohort>=1948 & cohort<=1950~9,
cohort>=1951 & cohort<=1953~10,
cohort>=1954 & cohort<=1956~11,
cohort>=1957 & cohort<=1959~12,
cohort>=1960 & cohort<=1962~13,
cohort>=1963 & cohort<=1965~14,
cohort>=1966 & cohort<=1968~15,
cohort>=1969 & cohort<=1972~16,
cohort>=1973 & cohort<=1976~17,
cohort>=1977 & cohort<=1980~18,
cohort>=1981 & cohort<=1985~19,
cohort>=1986 & cohort<=1994~20,
cohort>1994~NA_real_))
d$age9<- NA
d<- d %>% mutate(age9 = case_when(
age>=18 & age<=23~1,
age>=24 & age<=28~2,
age>=29 & age<=33~3,
age>=34 & age<=38~4,
age>=39 & age<=44~5,
age>=45 & age<=50~6,
age>=51 & age<=56~7,
age>=57 & age<=62~8,
age>=63 & age<=69~9,
TRUE~ as.numeric(age9)))
d$aged1<- as.numeric(d$age>=18 & d$age<=23)
d$aged2<- as.numeric(d$age>=24 & d$age<=28)
d$aged3<- as.numeric(d$age>=29 & d$age<=33)
d$aged4<- as.numeric(d$age>=34 & d$age<=38)
d$aged5<- as.numeric(d$age>=39 & d$age<=44)
d$aged6<- as.numeric(d$age>=45 & d$age<=50)
d$aged7<- as.numeric(d$age>=51 & d$age<=56)
d$aged8<- as.numeric(d$age>=57 & d$age<=62)
d$aged9<- as.numeric(d$age>=63 & d$age<=69)
d$bw1aged1<- d$bw1*d$aged1
d$bw1aged2<- d$bw1*d$aged2
d$bw1aged3<- d$bw1*d$aged3
d$bw1aged4<- d$bw1*d$aged4
d$bw1aged5<- d$bw1*d$aged5
d$bw1aged6<- d$bw1*d$aged6
d$bw1aged7<- d$bw1*d$aged7
d$bw1aged8<- d$bw1*d$aged8
d$bw1aged9<- d$bw1*d$aged9
The following code will attach value labels to your categorical variables. Remember that once a variable is converted to a factor, you cannot use it anymore in arithmetic to create other variables; e.g., you may still need the numeric bw1/race codes to compute Y-hat after a regression (a small workaround is sketched after the code below).
d$bw1 <- factor(d$bw1,
levels = c(0,1),
labels = c("black", "white"))
d$race <- factor(d$race,
levels = c(1,2,3),
labels = c("white", "black", "other"))
Imputation
Let’s say you want to impute cognitive data but allow for the possibility that the test correlations differ across race groups. First, remove the cases with too few subtest data, then split the data into male whites and male blacks. You do not want background information such as ID, age, or race to enter the imputation, but you need those variables back after the imputation is done. Finally, combine the separate datasets into one to restore the original data file, with the missing values filled in with imputed values.
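The imputation code below assumes the male-only subsets mwhite and mblack already exist. A minimal sketch of how they might be created, assuming a data frame called m with a sex variable (1 = male) and a wb variable (1 = white, 0 = black); these names are placeholders, adapt them to your own file:
mwhite<- subset(m, sex==1 & wb==1) # white males only
mblack<- subset(m, sex==1 & wb==0) # black males only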
library(mice)
mwhite<- mwhite[rowSums(is.na(mwhite)) <10,] # keep only cases with fewer than 10 missing subtests
impute<- data.frame(mwhite[,1:37]) # keep only columns 1-37 (the cognitive variables) so that background variables do not enter the imputation
impute<- impute %>% mutate_all(as.numeric)
impute1<- mice(impute, method='pmm', m=5, maxit=50, seed=500)
impute1<- complete(impute1, 2) # extract the second of the five imputed datasets as a complete data frame
mblack<- mblack[rowSums(is.na(mblack)) <10,]
impute<- data.frame(mblack[,1:37])
impute<- impute %>% mutate_all(as.numeric)
impute2<- mice(impute, method='pmm', m=5, maxit=50, seed=500)
impute2<- complete(impute2, 2)
mwhite1<- cbind(impute1, wb=mwhite$wb, sex=mwhite$sex, age=mwhite$age, BY_ID_RE=mwhite$BY_ID_RE) # select background variables and put them back
mblack1<- cbind(impute2, wb=mblack$wb, sex=mblack$sex, age=mblack$age, BY_ID_RE=mblack$BY_ID_RE) # same for blacks
dm<- rbind(mwhite1, mblack1) # combine black and white data
write.table(dm, file = "TALENTimputeM.csv", sep=",", row.names = F, col.names = T)
Summary statistics
Cross-tabulation can be used with more than 2 variables.
describeBy() is particularly useful. You get N, mean, 5% trimmed mean, median, standard deviation, standard error of the mean, skewness, kurtosis, min, max, and finally “mad”, a robust estimator of the standard deviation computed by rescaling the median absolute deviation from the sample median. describe() does the same thing but without groupings.
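For instance, you can verify the “mad” column yourself: by default, R’s mad() rescales the median absolute deviation by 1.4826 so that it estimates the SD under normality.
x<- na.omit(d$wordsum)
mad(x) # same value as the line below
1.4826*median(abs(x - median(x)))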
require(dplyr)
d %>% group_by(race, sex) %>% summarise(mean=mean(age), sd=sd(age))
require(psych) # for describe() and describeBy()
word_vars <- grep("^word", names(d), value = TRUE) # names of all variables starting with "word"
describe(d[, word_vars]) # descriptives for all of them at once
describeBy(d$degree, d$race) # summary stats of degree by race, but also works with a single variable
n = table(d$wordsum, d$degree, d$bw1) # select variables' N to be cross tabulated
addmargins(n) # N by cells and N_sums by rows and columns
addmargins(prop.table(n, margin=2)) # the same table expressed as proportions (normalized within the 2nd margin) rather than counts, with sums by rows and columns
with(d, table(degree, race, sex)) # another way of doing cross-tabulations (N per cells)
xtabs(~degree+bw1+gender, data=d, na.action=na.omit) # remove NA
xtabs(~degree+bw1+gender, data=d, subset=wordsum<11) # subset
by(d$wordsum, INDICES=d$educ, FUN=mean, na.rm=TRUE) # mean of wordsum by education level
mean(d$wordsum, trim=0.05, na.rm=TRUE) # mean trimmed at 5% for checking outliers, also removes these stupid NAs
shapiro.test(d$logincome) # it won't work because N is higher than 5000
Weighted means/SDs can be obtained through a survey design for either summary statistics or regression analysis. For summary stats, there are two ways:
library(survey) # get means and SEs
svy<- svydesign(data=d, id=~BY_ID_RE, strata=NULL, weights=d$BY_WTA)
svyby(~age, by = ~sex + wb, design=svy, FUN=svymean)
var<- svyby(~age, by = ~sex + wb, design=svy, FUN=svyvar) # get variance
sqrt(var) # get the "grouped" SD
library(srvyr) # get means, SEs and SDs
d %>% as_survey_design(ids = BY_ID_RE, weights = BY_WTA) %>% group_by(wb, sex) %>% summarise(MEAN = survey_mean(age), SD = survey_sd(age))
The histogram gives the distribution of wordsum by race. The boxplot displays the distribution of wordsum (median and quartiles) by race and gender.
require(lattice) # histogram() function
histogram(~ wordsum | race, data=d)
boxplot(wordsum~gender*race, data=d, notch=TRUE, col=(c("darkolivegreen1","violet")), main="Meng Hu hates everything and loves nothing", xlab="Race", ylab="Wordsum scores") # x axis shows the values of the categorical variables gender and race
Scatterplots
library(car) # scatterplot function
library(gplots) # plotmeans() function
library(lattice) # xyplot() bwplot() densityplot() functions
library(ggplot2)
library(ggpubr)
ggplot(d, aes(x=European_ancestry, y=income))+
  geom_point()+
  stat_cor(method = "pearson") # displays cor in the plot (needs ggpubr)
plot(d$wordsum, d$educ, xlab="wordsum vocabulary score", ylab="years of school", pch=20, col="gray50", frame.plot=TRUE) # you can configure the scale of x and y by using, xlim=c(0,10), ylim=c(0,20)
xyplot(d$wordsum~d$degree|d$bw1, main="title", ylab="wordsum", xlab="degree")
bwplot(d$wordsum~d$degree|d$bw1, main="title", ylab="wordsum", xlab="degree")
scatterplot(wordsum~cohort|bw1, boxplots=FALSE, smooth=TRUE, reg.line=FALSE, data=d) # reg.line=TRUE requests a regression line
densityplot(~d$logincome|d$bw1, main="title", xlab="log of real income") # just a Kernel density plot
plotmeans(wordsum ~ year, data=d, xlab="survey year", ylab="wordsum vocabulary test", n.label=FALSE) # n.label=TRUE gives you the sample size per values in the x variable, so if you have numerous values in x, don't do it...
pairs(~ wordsum + bw1 + logincome, data=d, subset=age<70, main="Meng Hu is not smart, his IQ has only 3 digits") # scatterplot matrices
Correlations
Using cor() without specifying use= will give you NA whenever there are missing data. cor() must be used with use="complete.obs" (listwise deletion) or use="pairwise.complete.obs" (pairwise deletion) if missing values are present. cor() can also be applied to an entire data set stored in the object “d”, but be careful if you have too many variables.
rcorr() produces 3 matrices: one for the correlations, one for the sample sizes, and one for the asymptotic p-values. It handles missing values automatically, using pairwise deletion.
cor.test() gives confidence intervals (only for Pearson) and lets you specify the alternative hypothesis (against the null), which may be “two.sided”, “less” (if the predicted correlation is negative) or “greater” (if the predicted correlation is positive).
require(Hmisc) # get rcorr() function
cor(d$wordsum, d$educ, use="pairwise.complete.obs", method="pearson") # could also use "spearman" or "kendall" but Kendall takes a damn lot of time...
rcorr(d$wordsum, d$educ, type="pearson")
cor.test(d$wordsum, d$educ, alternative="two.sided", method="pearson", conf.level=0.95)
cor.test(d$wordsum, d$educ, alternative="less", method="pearson", conf.level=0.95)
Cronbach’s alpha
require(psych) # load alpha() function, by far the best...
require(psy) # load cronbach() function
require(ltm) # load cronbach.alpha() function
require(dplyr) # get select() function
dk = select(d, worda, wordb, wordc, wordd, worde, wordf, wordg, wordh, wordi, wordj, wordsum, bw1)
dk = dk[complete.cases(dk),] # keep only cases with no missing data on all variables in dk
View(dk) # look carefully at the variables' columns
dk0 = dk # copy dk into dk0 object
dk1 = dk # copy dk into dk1 object
dk1$bw1[dk1$bw1==0] <- NA # remove blacks from dk1
dk0$bw1[dk0$bw1==1] <- NA # remove whites from dk0
dk0 = dk0[complete.cases(dk0),] # complete data for blacks "0"
dk1 = dk1[complete.cases(dk1),] # complete data for whites "1"
cronbach(dk[,1:10]) # items submitted to cronbach's alpha are columns 1-10
cronbach.alpha(dk[,1:10]) # items submitted to cronbach's alpha are columns 1-10
alpha(dk[,1:10]) # items submitted to cronbach's alpha are columns 1-10
alpha(dk0[,1:10]) # repeat analysis for blacks only
alpha(dk1[,1:10]) # repeat analysis for whites only
Multiple regression
Two formulas are possible: lm(d$y ~ d$x1 + d$x2) or lm(y ~ x1 + x2, data=d). You can use sampling weights, and you can analyze a subset of the data, e.g., males only, by using subset=gender<1. The inconvenience is that fitted() will then return values only for the cases actually used, so they will probably not line up with the rows of your data.
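For illustration only (this small model is mine, not one of the models below), a weighted regression restricted to males:
m_males<- lm(wordsum ~ educ + logincome, data=d, subset=gender<1, weights=weight) # males only (gender==0)
summary(m_males)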
require(car) # get VIF() function
require(dplyr) # get select() function
require(lattice) # xyplot() function
require(lsr) # get standardCoefs() function
dm0 = select(d, wordsum, bw1, cohortdummy2, cohortdummy3, cohortdummy4, cohortdummy5, cohortdummy6, bw1C2, bw1C3, bw1C4, bw1C5, bw1C6, cohort6, gender, age9, weight)
dm1 = select(d, wordsum, bw1, cohortdummy2, cohortdummy3, cohortdummy4, cohortdummy5, cohortdummy6, bw1C2, bw1C3, bw1C4, bw1C5, bw1C6, aged1, aged2, aged3, aged4, aged6, aged7, aged8, aged9, cohort6, gender, age9, weight)
dm0 = dm0[complete.cases(dm0),] # complete data for model m0, allows you to get an actual variable for Yhat by using fitted()
dm1 = dm1[complete.cases(dm1),] # complete data for model m1, allows you to get an actual variable for Yhat by using fitted()
m0<- lm(wordsum ~ bw1 + cohortdummy2 + cohortdummy3 + cohortdummy4 + cohortdummy5 + cohortdummy6 + bw1C2 + bw1C3 + bw1C4 + bw1C5 + bw1C6, data=dm0, weights=dm0$weight)
summary(m0)
m1<- lm(wordsum ~ bw1 + cohortdummy2 + cohortdummy3 + cohortdummy4 + cohortdummy5 + cohortdummy6 + bw1C2 + bw1C3 + bw1C4 + bw1C5 + bw1C6 + aged1 + aged2 + aged3 + aged4 + aged6 + aged7 + aged8 + aged9, data=dm1, weights=dm1$weight)
summary(m1)
standardCoefs(m1) # unstandardized and standardized betas
confint(m1, level=.99) # confidence intervals of unstandardized coefficients at 0.99 CI level
plot(m1) # fitted vs resid, Q-Q, scale-location, resid vs leverage (push ENTER if you want to see the graphs)
plot(m1, which=1) # residuals vs fitted
plot(m1, which=2) # normal QQ (standardized residuals vs theoretical quantiles)
plot(m1, which=3) # scale-location (√standardized residuals vs fitted values)
plot(m1, which=4) # Cook's distance vs observations' number
plot(m1, which=5) # standardized residuals vs leverage
dm1$res1<-residuals(m1) # save the residuals; residuals() is best used on complete cases so the rows line up with dm1
dm1$stdres1<-rstandard(m1) # standardized residuals
dm1$pred1<-fitted(m1) # save predicted Y; fitted() is best used on complete cases so the rows line up with dm1
qqnorm(dm1$res1, ylab="residualized values") # data points with no fitted line
qqline(dm1$res1) # add the reference line to the previous plot
residualPlots(m1) # plots of Pearson residuals against each independent variable
vif(m1) # VIF needs "car" package
anova(m0, m1)
xyplot(dm1$pred1 ~ dm1$cohort6 | dm1$age9, data=dm1, groups=bw1, pch=19, type=c("p"), col = c('red', 'blue'), grid=TRUE, ylab="Wordsum predicted by OLS regression", xlab="cohort", key=list(text=list(c("Black", "White")), points=list(pch=c(19,19), col=c("red", "blue")), columns=2)) # multi-panel plot of Yhat against cohort6 for each category of age9
ANOVA can give you the significance of the difference in model fit between our two regressions, but remember that significance tests are affected by sample size. I would much rather use the Cohen’s f² measure of effect size suggested by Selya et al. (2012).
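A minimal sketch of Cohen’s f² for the block of age dummies, using f² = (R²AB − R²A) / (1 − R²AB) from Selya et al. (2012). Note that m0 and m1 were fitted on slightly different cases (dm0 vs dm1); refit both on the same cases if you want an exact figure.
r2_a<- summary(m0)$r.squared # R2 of the model without the age dummies
r2_ab<- summary(m1)$r.squared # R2 of the model with the age dummies
(r2_ab - r2_a)/(1 - r2_ab) # Cohen's f2 for the added age dummies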
Errors-in-Variables (EIV) regression
This is a linear regression that corrects individually for measurement error in the independent variables (although I recommend SEM whenever it’s possible). There is no R package available, but Culpepper & Aguinis (2011) provide a handmade program. The results are identical to those obtained from the eivreg function in Stata. The intercept, R² and confidence intervals seem to be missing, however.
eiv<-function(formula,reliability,data){
mfx<-model.matrix(formula,data=data)
p<-length(mfx[1,])-1;n<-length(mfx[,1])
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset", "weights", "na.action", "offset"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- as.name("model.frame")
mf <- eval(mf, parent.frame())
mf<-data.frame(mf)
MXX<-var(mfx[,c(2:(p+1))]);MXY<-var(mfx[,c(2:(p+1))],mf[,1])
Suu<-matrix(0,p,p)
if(p==1) Suu=(1-reliability)*MXX else
if(p>1) diag(Suu)<-(1-reliability)*diag(MXX)
Mxx<-MXX-(1-p/n)*Suu;Btilde<-solve(Mxx)%*%MXY
vY=var(mf[,1])
MSEtilde<-as.numeric(n*(vY-2*t(Btilde)%*%MXY+t(Btilde)%*%MXX%*%Btilde)/(n-p-1))
Rhat<-matrix(0,p,p);diag(Rhat)<-(t(Btilde)%*%Suu)^2
VCtilde<-MSEtilde*(1/n)*solve(Mxx)+(1/n)*solve(Mxx)%*%(Suu*MSEtilde+Suu%*%Btilde%*%t(Btilde)%*%Suu+2*Rhat)%*%solve(Mxx)
ttilde<-Btilde/sqrt(diag(VCtilde))
output<-cbind(reliability,Btilde,sqrt(diag(VCtilde)),ttilde,2*(1-pt(abs(ttilde),n-p)))
colnames(output)<-c('Reliability','Est.','S.E.','t','Prob.(>|t|)')
output
}
eiv(educ~wordsum+logincome+age,reliability=c(.71,.75,1),data=d)
Relative weight (regression) analysis
This is a technique claimed to be able to disentangle the tangle of correlations between the independent variables. According to its proponents, relative importance analysis can tell which independent variable is the strongest predictor, something classical regression cannot do.
library(yhat) # use yhat package for dominance/commonality regression analysis
Meng_Hu_is_a_mean_boy_dont_look_at_him<-lm(wordsum~educ+degree+logincome+age,data=d)
dammit<-regr(Meng_Hu_is_a_mean_boy_dont_look_at_him)
dammit$Beta_Weights
dammit$Structure_Coefficients
dammit$Commonality_Data
Tobit regression
I know of two functions: censReg() from the censReg package and vglm() from the VGAM package. censReg displays numbers not available in VGAM, so it’s complementary; however, it does not (yet) allow sampling weights. Tobit requires you to specify the value at which the dependent variable is right-censored (the upper limit). Here, wordsum has values 0-10, so I use 10 as the upper limit.
library(censReg) # censReg() function
library(VGAM) # vglm() function
library(lattice) # xyplot() function
still_hate_R<- censReg(wordsum ~ bw1 + agec + agec2 + agec3 + bw1agec + bw1agec2 + bw1agec3, right=10, data=d)
summary(still_hate_R)
So_complicated<- vglm(wordsum ~ bw1 + agec + agec2 + agec3 + bw1agec + bw1agec2 + bw1agec3, tobit(Upper=10), data=d, weights=d$weight)
summary(So_complicated)
b <- coef(So_complicated)
se <- sqrt(diag(vcov(So_complicated)))
cbind(LL = b - qnorm(0.975) * se, UL = b + qnorm(0.975) * se) # CIs
d$yhat <- fitted(So_complicated)[,1]
d$wordsumpredictedage<- 5.210807+1.355356*d$bw1+-.0051079*d$agec+-.0016632*d$agec2+.000017*d$agec3+.0169752*d$bw1agec+.0002868*d$bw1agec2+.0000104*d$bw1agec3
d$wordsumpredictedage<- d$wordsumpredictedage + 0*(d$wordsum + d$bw1 + d$age) # set the prediction to NA wherever wordsum, bw1 or age is missing, without changing the predicted values
xyplot(d$wordsumpredictedage ~ d$age, data=d, groups=bw1, pch=19, type=c("p"), col = c('red', 'blue'), grid=TRUE, ylab="Wordsum predicted by tobit", xlab="age", key=list(text=list(c("Black", "White")), points=list(pch=c(19,19), col=c("red", "blue")), columns=2))
Don’t try to extract residuals, even though you can. Stata knows this is problematic and will tell you the procedure is incorrect, but R won’t let you know. In tobit, Y is not treated as an observed variable but as a latent one, so the residuals can’t be extracted the usual way. You must use the method elaborated by Cameron & Trivedi (2009, pp. 535-538).
Logistic regression
LR evaluates how the probability of scoring 1 (versus 0) on a binary Y variable varies across the values of the group variables or continuous variables of interest. Usually, people use socio-economic variables. But one can also perform a Differential Item Functioning (DIF) test with LR (Oliveri et al., 2012). The idea is that an unbiased item must produce identical response curves for all groups once the item response is conditioned on the total test score, group membership, and the interaction of group membership with total score.
library(lattice) # xyplot() function
require(dplyr) # get select() function
dk<- select(d, worda, wordb, wordc, wordd, worde, wordf, wordg, wordh, wordi, wordj, wordsum, bw1)
dk = dk[complete.cases(dk),] # keep only cases with no missing data on all variables in dk
dk$bw1wordsum<- dk$bw1*dk$wordsum
Meng_Hu_controls_the_Illuminati<- glm(wordg ~ wordsum + bw1 + bw1wordsum, data=dk, family=binomial("logit")) # request logit
summary(Meng_Hu_controls_the_Illuminati)
dk$fittedlogit1 = fitted(Meng_Hu_controls_the_Illuminati)
anova(Meng_Hu_controls_the_Illuminati, test="Chisq") # gives p-values and Deviance and changes in Deviance between nested models for variables added sequentially
drop1(Meng_Hu_controls_the_Illuminati, test="Chisq") # gives p-values and Deviance and changes in Deviance between models with and without a given variable
dk$logit1 = -4.12497 + 0.47041*dk$wordsum + -2.95061*dk$bw1 + 0.49316*dk$bw1wordsum
dk$odds1 = exp(dk$logit1)
dk$probability1 = dk$odds1/(1+dk$odds1)
xyplot(dk$probability1~ dk$wordsum, data=dk, groups=bw1, pch=19, type=c("p"), col = c('red', 'blue'), grid=TRUE, ylab="probability of correct answer in word g", xlab="wordsum total score", key=list(text=list(c("Black", "White")), points=list(pch=c(19,19), col=c("red", "blue")), columns=2))
xyplot(dk$fittedlogit1~ dk$wordsum, data=dk, groups=bw1, pch=19, type=c("p"), col = c('red', 'blue'), grid=TRUE, ylab="probability of correct answer in word g", xlab="wordsum total score", key=list(text=list(c("Black", "White")), points=list(pch=c(19,19), col=c("red", "blue")), columns=2))
anova() shows whether the model fit changes significantly when variables are sequentially added, and drop1() shows whether the model fit changes significantly when a given variable is dropped from the full model (i.e., with all x variables). Both functions could have been used with F, LRT, Rao, or Cp instead of Chisq. See help(anova).
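For example, the same calls with other test statistics (for a glm, "LRT" is a synonym of "Chisq", and "Rao" gives the score test):
anova(Meng_Hu_controls_the_Illuminati, test="LRT")
drop1(Meng_Hu_controls_the_Illuminati, test="Rao")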
Multilevel (linear) regression
I use the package lme4 for LME models. REML is the default option; set REML=FALSE if you want ML estimation. For a random intercept model, you write (1 | cohort21); for a random slope model, you write (bw1 | cohort21), where bw1 is the slope. You can specify multiple random slopes. If you want to add another level, say the survey year, you write (bw1 | cohort21) + (1 | year). In this way, with age as fixed effects, you can completely isolate the variation of Y due to age, period, and cohort effects: the APC model of Yang & Land (2008).
The package MuMIn is also needed if you want to compute the R²GLMM of Johnson (2014) for the random intercept and random slope models. Johnson himself has a self-made syntax (see below) which produces similar results to MuMIn. The package arm is needed if you want the standard errors of the random coefficients.
require(lme4) # multilevel regression ('lme' does not work on my recent version of R)
require(arm) # extract standard errors of model coefficients
require(MuMIn) # r.squaredGLMM() function
require(lattice) # qqmath() function
model.ri<- lmer(wordsum ~ bw1 + aged1 + aged2 + aged3 + aged4 + aged6 + aged7 + aged8 + aged9 + bw1aged1 + bw1aged2 + bw1aged3 + bw1aged4 + bw1aged6 + bw1aged7 + bw1aged8 + bw1aged9 + (1 | cohort21) + (1 | year), data = d, REML=TRUE)
summary(model.ri) # fixed effects + -2LL stats
display(model.ri) # get AIC and DIC model fit indices
coef(model.ri) # random effect coefficients (model intercept + random residuals)
se.coef(model.ri) # their standard errors
ranef(model.ri) # random (i.e., intercept) residuals
qqmath(ranef(model.ri, condVar=TRUE)) # QQ plot of intercept residuals
coefs<- data.frame(coef(summary(model.ri)))
coefs$p.z <- 2 * (1 - pnorm(abs(coefs$t.value)))
coefs # get p-values of fixed effects
model.rs<- lmer(wordsum ~ bw1 + aged1 + aged2 + aged3 + aged4 + aged6 + aged7 + aged8 + aged9 + bw1aged1 + bw1aged2 + bw1aged3 + bw1aged4 + bw1aged6 + bw1aged7 + bw1aged8 + bw1aged9 + (bw1 | cohort21) + (1 | year), data = d, REML=TRUE)
summary(model.rs) # fixed effects + -2LL stats
display(model.rs) # get AIC and DIC model fit indices
coef(model.rs) # random effect coefficients (model intercept/slope + random residuals)
se.coef(model.rs) # their standard errors
ranef(model.rs) # random (i.e., intercept/slope) residuals
qqmath(ranef(model.rs, condVar=TRUE)) # QQ plot of intercept/slope residuals
coefs<- data.frame(coef(summary(model.rs)))
coefs$p.z <- 2 * (1 - pnorm(abs(coefs$t.value)))
coefs # get p-values of fixed effects
X <- model.matrix(model.ri)
n <- nrow(X)
Beta <- fixef(model.ri)
Sf <- var(X %*% Beta)
Sigma.list <- VarCorr(model.ri)
Sl <-
sum(
sapply(Sigma.list,
function(Sigma)
{
Z <-X[,rownames(Sigma)]
sum(rowSums((Z %*% Sigma) * Z))/n
}))
Se <- attr(Sigma.list, "sc")^2
Sd <- 0
total.var <- Sf + Sl + Se + Sd
(Rsq.m <- Sf / total.var)
(Rsq.c <- (Sf + Sl) / total.var)
rm(X, n, Beta, Sf, Sigma.list, Sl, Se, Sd, total.var, Rsq.m, Rsq.c) # remove all these stored objects
X <- model.matrix(model.rs)
n <- nrow(X)
Beta <- fixef(model.rs)
Sf <- var(X %*% Beta)
Sigma.list <- VarCorr(model.rs)
Sl <-
sum(
sapply(Sigma.list,
function(Sigma)
{
Z <-X[,rownames(Sigma)]
sum(rowSums((Z %*% Sigma) * Z))/n
}))
Se <- attr(Sigma.list, "sc")^2
Sd <- 0
total.var <- Sf + Sl + Se + Sd
(Rsq.m <- Sf / total.var)
(Rsq.c <- (Sf + Sl) / total.var)
r.squaredGLMM(model.ri)
r.squaredGLMM(model.rs)
chistat<- anova(model.ri, model.rs)[2,"Chisq"] # χ2 but model refitted with ML instead of REML
0.5 * pchisq(chistat, 1, lower.tail=FALSE) + 0.5 * pchisq(chistat, 2, lower.tail=FALSE)
require(dplyr) # get select() function
dk<- select(d, wordsum, bw1, aged1, aged2, aged3, aged4, aged6, aged7, aged8, aged9, bw1aged1, bw1aged2, bw1aged3, bw1aged4, bw1aged6, bw1aged7, bw1aged8, bw1aged9, age9, cohort21)
dk = dk[complete.cases(dk),] # keep only cases with no missing data on all variables in dk
lme1<- lmer(wordsum ~ bw1 + aged1 + aged2 + aged3 + aged4 + aged6 + aged7 + aged8 + aged9 + bw1aged1 + bw1aged2 + bw1aged3 + bw1aged4 + bw1aged6 + bw1aged7 + bw1aged8 + bw1aged9 + (bw1 | cohort21), data = dk, REML=TRUE)
dk$fittedlme1<-fitted(lme1) # fitted() works only with complete.cases
xyplot(dk$fittedlme1 ~ dk$cohort21 | dk$age9, data=dk, groups=bw1, pch=19, type=c("p"), col = c('red', 'blue'), grid=TRUE, ylab="Wordsum predicted by multilevel random slope model", xlab="cohort", key=list(text=list(c("Black", "White")), points=list(pch=c(19,19), col=c("red", "blue")), columns=2)) # multi-panel plot of Yhat against cohort21 for each category of age9
Differential Item Functioning analyses
Below, I show how to use difR. It’s tedious, because the items are selected not by their variable names but by their column positions in the data window. So let’s create a subset of our data d in the object dk. We get a dk data frame with 12 variables. Let’s look at the screenshot.
require(difR) # get difR package for summary stats of DIF items with diverse methods
require(dplyr) # get select() function
dk<- select(d, worda, wordb, wordc, wordd, worde, wordf, wordg, wordh, wordi, wordj, wordsum, bw1)
dk = dk[complete.cases(dk),] # keep only cases with no missing data on all variables in dk
View(dk)
In the above picture, it appears that worda to wordj are listed in columns 2:11 and bw1 in column 13. This is false. If you try to select column 13, R will shoot you this message: "Error in `[.data.frame`(dk, , 13) : undefined columns selected". Indeed, the row.names column does not count as a real variable column. Thus, worda:wordj are in columns 1:10, wordsum is in column 11 and bw1 in column 12, and those are the column numbers used in the calls below.
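If you would rather not count columns by hand, you can look the positions up by name; a small sketch (the item_cols and group_col object names are my own):
item_cols<- which(names(dk) %in% paste0("word", letters[1:10])) # positions of worda:wordj
group_col<- which(names(dk)=="bw1") # position of bw1
mantelHaenszel(dk[,item_cols], dk[,group_col], correct=TRUE, exact=FALSE)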
mantelHaenszel(dk[,1:10], dk[,12], correct=TRUE, exact=FALSE, anchor=c(6,8:10)) # columns 1:10 for worda:wordj, column 12 for bw1, columns 6, 8, 9, 10 (word f, h, i, j) are specified as anchor variables (i.e., DIF free)
difMH(dk[,1:10], dk[,12], focal.name=0, MHstat="MHChisq", correct=TRUE, exact=FALSE, purify=TRUE, nrIter=10) # focal group (i.e., black) is coded 0 in bw1
plot(difMH(dk[,1:10], dk[,12], focal.name=0, MHstat="MHChisq", correct=TRUE, exact=FALSE, purify=TRUE, nrIter=10)) # plot of χ2 stats for each item (do not try to save it into an object and request it after because it won't work)
difLogistic1<-difLogistic(dk[,1:10], dk[,12], focal.name=0, type="udif", criterion="LRT", alpha=0.05, purify=TRUE, nrIter=10, save.output=TRUE) # uniform DIF
difLogistic2<-difLogistic(dk[,1:10], dk[,12], focal.name=0, type="nudif", criterion="LRT", alpha=0.05, purify=FALSE, nrIter=10, save.output=TRUE) # non-uniform DIF
difLogistic3<-difLogistic(dk[,1:10], dk[,12], focal.name=0, type="both", criterion="LRT", alpha=0.05, purify=FALSE, nrIter=10, save.output=TRUE) # uniform and non-uniform DIF
difLogistic1
difLogistic2
difLogistic3
plot(difLogistic1, save.plot = TRUE, save.options = c("difLogistic1", "default", "jpeg")) # plot LRT or Wald statistics per item for "udif"
plot(difLogistic2, save.plot = TRUE, save.options = c("difLogistic2", "default", "jpeg")) # plot LRT or Wald statistics per item for "nudif"
plot(difLogistic3, save.plot = TRUE, save.options = c("difLogistic3", "default", "jpeg")) # plot LRT or Wald statistics per item for "both"
plot(difLogistic1, plot="itemCurve", item="worda", save.plot = TRUE, save.options = c("ICCdifLogisticRace1wordA", "default", "jpeg"))
plot(difLogistic1, plot="itemCurve", item="wordb", save.plot = TRUE, save.options = c("ICCdifLogisticRace1wordB", "default", "jpeg"))
plot(difLogistic1, plot="itemCurve", item="wordc", save.plot = TRUE, save.options = c("ICCdifLogisticRace1wordC", "default", "jpeg"))
plot(difLogistic1, plot="itemCurve", item="wordd", save.plot = TRUE, save.options = c("ICCdifLogisticRace1wordD", "default", "jpeg"))
plot(difLogistic1, plot="itemCurve", item="worde", save.plot = TRUE, save.options = c("ICCdifLogisticRace1wordE", "default", "jpeg"))
plot(difLogistic1, plot="itemCurve", item="wordf", save.plot = TRUE, save.options = c("ICCdifLogisticRace1wordF", "default", "jpeg"))
plot(difLogistic1, plot="itemCurve", item="wordg", save.plot = TRUE, save.options = c("ICCdifLogisticRace1wordG", "default", "jpeg"))
plot(difLogistic1, plot="itemCurve", item="wordh", save.plot = TRUE, save.options = c("ICCdifLogisticRace1wordH", "default", "jpeg"))
plot(difLogistic1, plot="itemCurve", item="wordi", save.plot = TRUE, save.options = c("ICCdifLogisticRace1wordI", "default", "jpeg"))
plot(difLogistic1, plot="itemCurve", item="wordj", save.plot = TRUE, save.options = c("ICCdifLogisticRace1wordJ", "default", "jpeg"))
plot(difLogistic2, plot="itemCurve", item="worda", save.plot = TRUE, save.options = c("ICCdifLogisticRace2wordA", "default", "jpeg"))
plot(difLogistic2, plot="itemCurve", item="wordb", save.plot = TRUE, save.options = c("ICCdifLogisticRace2wordB", "default", "jpeg"))
plot(difLogistic2, plot="itemCurve", item="wordc", save.plot = TRUE, save.options = c("ICCdifLogisticRace2wordC", "default", "jpeg"))
plot(difLogistic2, plot="itemCurve", item="wordd", save.plot = TRUE, save.options = c("ICCdifLogisticRace2wordD", "default", "jpeg"))
plot(difLogistic2, plot="itemCurve", item="worde", save.plot = TRUE, save.options = c("ICCdifLogisticRace2wordE", "default", "jpeg"))
plot(difLogistic2, plot="itemCurve", item="wordf", save.plot = TRUE, save.options = c("ICCdifLogisticRace2wordF", "default", "jpeg"))
plot(difLogistic2, plot="itemCurve", item="wordg", save.plot = TRUE, save.options = c("ICCdifLogisticRace2wordG", "default", "jpeg"))
plot(difLogistic2, plot="itemCurve", item="wordh", save.plot = TRUE, save.options = c("ICCdifLogisticRace2wordH", "default", "jpeg"))
plot(difLogistic2, plot="itemCurve", item="wordi", save.plot = TRUE, save.options = c("ICCdifLogisticRace2wordI", "default", "jpeg"))
plot(difLogistic2, plot="itemCurve", item="wordj", save.plot = TRUE, save.options = c("ICCdifLogisticRace2wordJ", "default", "jpeg"))
plot(difLogistic3, plot="itemCurve", item="worda", save.plot = TRUE, save.options = c("ICCdifLogisticRace3wordA", "default", "jpeg"))
plot(difLogistic3, plot="itemCurve", item="wordb", save.plot = TRUE, save.options = c("ICCdifLogisticRace3wordB", "default", "jpeg"))
plot(difLogistic3, plot="itemCurve", item="wordc", save.plot = TRUE, save.options = c("ICCdifLogisticRace3wordC", "default", "jpeg"))
plot(difLogistic3, plot="itemCurve", item="wordd", save.plot = TRUE, save.options = c("ICCdifLogisticRace3wordD", "default", "jpeg"))
plot(difLogistic3, plot="itemCurve", item="worde", save.plot = TRUE, save.options = c("ICCdifLogisticRace3wordE", "default", "jpeg"))
plot(difLogistic3, plot="itemCurve", item="wordf", save.plot = TRUE, save.options = c("ICCdifLogisticRace3wordF", "default", "jpeg"))
plot(difLogistic3, plot="itemCurve", item="wordg", save.plot = TRUE, save.options = c("ICCdifLogisticRace3wordG", "default", "jpeg"))
plot(difLogistic3, plot="itemCurve", item="wordh", save.plot = TRUE, save.options = c("ICCdifLogisticRace3wordH", "default", "jpeg"))
plot(difLogistic3, plot="itemCurve", item="wordi", save.plot = TRUE, save.options = c("ICCdifLogisticRace3wordI", "default", "jpeg"))
plot(difLogistic3, plot="itemCurve", item="wordj", save.plot = TRUE, save.options = c("ICCdifLogisticRace3wordJ", "default", "jpeg"))
Logistik1<-Logistik(dk[,1:10], dk[,12], anchor=c(6,8:10), type="udif", criterion="LRT") # uniform DIF
Logistik2<-Logistik(dk[,1:10], dk[,12], anchor=c(6,8:10), type="nudif", criterion="LRT") # non-uniform DIF
Logistik3<-Logistik(dk[,1:10], dk[,12], anchor=c(6,8:10), type="both", criterion="LRT") # uniform and non-uniform DIF
Logistik1
Logistik2
Logistik3
difRaju1<-difRaju(dk[,1:10], group=dk[,12], focal.name=0, model="1PL", purify=TRUE, signed=TRUE)
difRaju2<-difRaju(dk[,1:10], group=dk[,12], focal.name=0, model="2PL", purify=TRUE, signed=TRUE)
difRaju3<-difRaju(dk[,1:10], group=dk[,12], focal.name=0, model="3PL", c=0.05, purify=FALSE, signed=TRUE)
difRaju1
difRaju2
difRaju3
plot(difRaju1, save.plot = TRUE, save.options = c("difRaju1", "default", "jpeg")) # plot Raju's statistic per item for 1PL
plot(difRaju2, save.plot = TRUE, save.options = c("difRaju2", "default", "jpeg")) # plot Raju's statistic per item for 2PL
plot(difRaju3, save.plot = TRUE, save.options = c("difRaju3", "default", "jpeg")) # plot Raju's statistic per item for 3PL
difLord1<-difLord(dk[,1:10], group=dk[,12], focal.name=0, model="1PL", engine="ltm", discr=1, irtParam=NULL, same.scale=TRUE, alpha=0.05, purify=TRUE, nrIter=10)
difLord2<-difLord(dk[,1:10], group=dk[,12], focal.name=0, model="2PL", engine="ltm", discr=1, irtParam=NULL, same.scale=TRUE, alpha=0.05, purify=FALSE, nrIter=10)
difLord3<-difLord(dk[,1:10], group=dk[,12], focal.name=0, model="3PL", engine="ltm", discr=1, irtParam=NULL, same.scale=TRUE, alpha=0.05, c=0.05, purify=FALSE, nrIter=10)
difLord1
difLord2
difLord3
plot(difLord1, save.plot = TRUE, save.options = c("difLord1", "default", "jpeg")) # plot Lord's χ2 per item for 1PL
plot(difLord2, save.plot = TRUE, save.options = c("difLord2", "default", "jpeg")) # plot Lord's χ2 per item for 2PL
plot(difLord3, save.plot = TRUE, save.options = c("difLord3", "default", "jpeg")) # plot Lord's χ2 per item for 3PL
plot(difLord1, plot="itemCurve", item="worda", save.plot = TRUE, save.options = c("ICCdifLordRace1PLwordA", "default", "jpeg"))
plot(difLord1, plot="itemCurve", item="wordb", save.plot = TRUE, save.options = c("ICCdifLordRace1PLwordB", "default", "jpeg"))
plot(difLord1, plot="itemCurve", item="wordc", save.plot = TRUE, save.options = c("ICCdifLordRace1PLwordC", "default", "jpeg"))
plot(difLord1, plot="itemCurve", item="wordd", save.plot = TRUE, save.options = c("ICCdifLordRace1PLwordD", "default", "jpeg"))
plot(difLord1, plot="itemCurve", item="worde", save.plot = TRUE, save.options = c("ICCdifLordRace1PLwordE", "default", "jpeg"))
plot(difLord1, plot="itemCurve", item="wordf", save.plot = TRUE, save.options = c("ICCdifLordRace1PLwordF", "default", "jpeg"))
plot(difLord1, plot="itemCurve", item="wordg", save.plot = TRUE, save.options = c("ICCdifLordRace1PLwordG", "default", "jpeg"))
plot(difLord1, plot="itemCurve", item="wordh", save.plot = TRUE, save.options = c("ICCdifLordRace1PLwordH", "default", "jpeg"))
plot(difLord1, plot="itemCurve", item="wordi", save.plot = TRUE, save.options = c("ICCdifLordRace1PLwordI", "default", "jpeg"))
plot(difLord1, plot="itemCurve", item="wordj", save.plot = TRUE, save.options = c("ICCdifLordRace1PLwordJ", "default", "jpeg"))
plot(difLord2, plot="itemCurve", item="worda", save.plot = TRUE, save.options = c("ICCdifLordRace2PLwordA", "default", "jpeg"))
plot(difLord2, plot="itemCurve", item="wordb", save.plot = TRUE, save.options = c("ICCdifLordRace2PLwordB", "default", "jpeg"))
plot(difLord2, plot="itemCurve", item="wordc", save.plot = TRUE, save.options = c("ICCdifLordRace2PLwordC", "default", "jpeg"))
plot(difLord2, plot="itemCurve", item="wordd", save.plot = TRUE, save.options = c("ICCdifLordRace2PLwordD", "default", "jpeg"))
plot(difLord2, plot="itemCurve", item="worde", save.plot = TRUE, save.options = c("ICCdifLordRace2PLwordE", "default", "jpeg"))
plot(difLord2, plot="itemCurve", item="wordf", save.plot = TRUE, save.options = c("ICCdifLordRace2PLwordF", "default", "jpeg"))
plot(difLord2, plot="itemCurve", item="wordg", save.plot = TRUE, save.options = c("ICCdifLordRace2PLwordG", "default", "jpeg"))
plot(difLord2, plot="itemCurve", item="wordh", save.plot = TRUE, save.options = c("ICCdifLordRace2PLwordH", "default", "jpeg"))
plot(difLord2, plot="itemCurve", item="wordi", save.plot = TRUE, save.options = c("ICCdifLordRace2PLwordI", "default", "jpeg"))
plot(difLord2, plot="itemCurve", item="wordj", save.plot = TRUE, save.options = c("ICCdifLordRace2PLwordJ", "default", "jpeg"))
plot(difLord3, plot="itemCurve", item="worda", save.plot = TRUE, save.options = c("ICCdifLordRace3PLwordA", "default", "jpeg"))
plot(difLord3, plot="itemCurve", item="wordb", save.plot = TRUE, save.options = c("ICCdifLordRace3PLwordB", "default", "jpeg"))
plot(difLord3, plot="itemCurve", item="wordc", save.plot = TRUE, save.options = c("ICCdifLordRace3PLwordC", "default", "jpeg"))
plot(difLord3, plot="itemCurve", item="wordd", save.plot = TRUE, save.options = c("ICCdifLordRace3PLwordD", "default", "jpeg"))
plot(difLord3, plot="itemCurve", item="worde", save.plot = TRUE, save.options = c("ICCdifLordRace3PLwordE", "default", "jpeg"))
plot(difLord3, plot="itemCurve", item="wordf", save.plot = TRUE, save.options = c("ICCdifLordRace3PLwordF", "default", "jpeg"))
plot(difLord3, plot="itemCurve", item="wordg", save.plot = TRUE, save.options = c("ICCdifLordRace3PLwordG", "default", "jpeg"))
plot(difLord3, plot="itemCurve", item="wordh", save.plot = TRUE, save.options = c("ICCdifLordRace3PLwordH", "default", "jpeg"))
plot(difLord3, plot="itemCurve", item="wordi", save.plot = TRUE, save.options = c("ICCdifLordRace3PLwordI", "default", "jpeg"))
plot(difLord3, plot="itemCurve", item="wordj", save.plot = TRUE, save.options = c("ICCdifLordRace3PLwordJ", "default", "jpeg"))
dichoDif(dk[,1:10], group=dk[,12], focal.name=0, method=c("TID","MH","Std", "Logistic", "Lord", "Raju"), model="1PL", correct=FALSE, thrSTD=0.08, thrTID=1, purify=FALSE)
dichoDif(dk[,1:10], group=dk[,12], focal.name=0, method=c("TID","MH","Std", "Logistic", "Lord", "Raju"), model="2PL", correct=FALSE, thrSTD=0.08, thrTID=1, purify=FALSE)
Notice that column 11 (wordsum) is not even specified in the syntax. The reason is that these DIF methods compute the matching total score from the item columns themselves, so the wordsum variable is not needed. Here’s a description of the arguments used:
anchor=c() specifies the item(s) included in your anchor set.
correct asks : should the continuity correction be used? (default is TRUE).
purify asks : should the method be used iteratively to purify the set of anchor items? (default is FALSE).
nrIter sets the maximal number of iterations in the item purification process (default is 10).
thrSTD is the threshold (cut-score) for standardized P-DIF statistic (default is 0.10).
thrTID is the threshold for detecting DIF items with TID method (default is 1.5).
save.output will save your output into a new file in your computer’s folder.
PCA, FA, CFA and MGCFA
Factor Analysis or Principal Component Analysis should be carried out first. We get the pattern of factor loadings to be used in CFA/MGCFA, and we also determine the number of factors to be extracted. In most applications of MGCFA, I don’t see people using parallel analysis to decide how many factors to extract (although it should be noted that it is sensitive to sample size). Sometimes they use the scree plot, but eyeballing this tedious plot is no easy task. And sometimes they use Kaiser's eigenvalue-greater-than-one rule, which is the worst one, since the cut-off is arbitrary and there is no clear delimitation (e.g., a factor with an eigenvalue of 1.01 is considered major and another of 0.99 trivial). A recommended reading is Courtney (2013). Another approach is to look at the theory within the field of study for indications of how many interpretable factors to expect. That may be a better approach because it lets the science drive the statistics rather than the statistics drive the science.
When this step is done, we combine the two covariance matrices (group1+group2) and we assess the model fit of equality in the pattern of loadings, in factor loadings, in means (intercepts) and, finally, in residuals. These are known as configural invariance, metric invariance, scalar invariance, and residual (strict) invariance.
The following example is taken from the input data analyzed by Dolan (2000). This is an attempt at replication. The purpose is to test a model with and without a second-order latent factor.
library(lavaan) # get SEM, CFA, MGCFA programs
library(GPArotation) # get quartimax rotation
library(leaps) # FactoMineR() does not work without leaps installed or loaded
library(FactoMineR) # get PCA() function
library(psych) # get fa.parallel() function
library(nFactors) # parallel() function to determine number of factors to be extracted
library(semTools) # get measurementInvariance() function
black.means<-c(8.09, 7.91, 8.63, 7.86, 7.83, 9.18, 9.12, 8.12, 8.10, 7.70, 7.89, 8.86, 8.39)
black.sd<-c(2.65, 2.92, 2.75, 2.76, 2.53, 3.19, 2.95, 3.03, 3.03, 2.70, 2.96, 2.93, 3.22)
black.cor<-matrix(, nrow=13, ncol=13)
black.cor[lower.tri(black.cor, diag=TRUE)]<-c(1.00, 0.55, 0.53,
0.63, 0.49, 0.43, 0.32, 0.42, 0.29, 0.37, 0.31, 0.21, 0.26,
1.00, 0.46, 0.65, 0.48, 0.34, 0.21, 0.43, 0.36, 0.41, 0.36,
0.26, 0.24, 1.00, 0.52, 0.39, 0.50, 0.30, 0.32, 0.23, 0.40,
0.28, 0.28, 0.22, 1.00, 0.63, 0.41, 0.25, 0.43, 0.36, 0.41,
0.34, 0.28, 0.25, 1.00, 0.35, 0.24, 0.44, 0.38, 0.38, 0.35,
0.26, 0.30, 1.00, 0.43, 0.28, 0.30, 0.35, 0.25, 0.25, 0.28,
1.00, 0.29, 0.26, 0.26, 0.17, 0.25, 0.26, 1.00, 0.37, 0.48,
0.49, 0.16, 0.36, 1.00, 0.37, 0.41, 0.21, 0.32, 1.00, 0.57,
0.43, 0.29, 1.00, 0.39, 0.19, 1.00, 0.18, 1.00)
black.cor[upper.tri(black.cor, diag=TRUE)]<-t(black.cor)[upper.tri(black.cor, diag=TRUE)]
dimnames(black.cor)<-list(c("Information", "Similarities", "Arithmetic", "Vocabulary", "Comprehension", "DigitSpan", "TappingSpan", "PictureCompletion", "PictureArrangement", "BlockDesign", "ObjectAssembly", "Coding", "Mazes"), c("Information", "Similarities", "Arithmetic", "Vocabulary", "Comprehension", "DigitSpan", "TappingSpan", "PictureCompletion", "PictureArrangement", "BlockDesign", "ObjectAssembly", "Coding", "Mazes"))
black.cov<-cor2cov(black.cor, black.sd)
white.means<-c(10.41, 10.29, 10.37, 10.42, 10.44, 10.08, 10.09, 10.41, 10.37, 10.39, 10.73, 10.22, 10.41)
white.sd<-c(2.91, 3.01, 2.84, 2.94, 2.81, 3.00, 2.87, 2.87, 2.91, 2.92, 3.01, 3.30, 3.06)
white.cor<-matrix(, nrow=13, ncol=13)
white.cor[lower.tri(white.cor, diag=TRUE)]<-c(1.00, 0.58, 0.51,
0.66, 0.51, 0.34, 0.25, 0.35, 0.37, 0.44, 0.34, 0.26, 0.22,
1.00, 0.43, 0.63, 0.55, 0.33, 0.19, 0.40, 0.37, 0.45, 0.35,
0.25, 0.24, 1.00, 0.48, 0.40, 0.42, 0.32, 0.30, 0.26, 0.41,
0.23, 0.29, 0.24, 1.00, 0.61, 0.36, 0.24, 0.38, 0.39, 0.43,
0.33, 0.29, 0.21, 1.00, 0.23, 0.19, 0.35, 0.34, 0.38, 0.29,
0.23, 0.23, 1.00, 0.37, 0.16, 0.18, 0.29, 0.17, 0.28, 0.18,
1.00, 0.16, 0.19, 0.27, 0.15, 0.25, 0.19, 1.00, 0.34, 0.47,
0.41, 0.15, 0.29, 1.00, 0.41, 0.37, 0.22, 0.27, 1.00, 0.56,
0.30, 0.39, 1.00, 0.20, 0.31, 1.00, 0.18, 1.00)
white.cor[upper.tri(white.cor, diag=TRUE)]<-t(white.cor)[upper.tri(white.cor, diag=TRUE)]
dimnames(white.cor)<-list(c("Information", "Similarities", "Arithmetic", "Vocabulary", "Comprehension", "DigitSpan", "TappingSpan", "PictureCompletion", "PictureArrangement", "BlockDesign", "ObjectAssembly", "Coding", "Mazes"), c("Information", "Similarities", "Arithmetic", "Vocabulary", "Comprehension", "DigitSpan", "TappingSpan", "PictureCompletion", "PictureArrangement", "BlockDesign", "ObjectAssembly", "Coding", "Mazes"))
white.cov<-cor2cov(white.cor, white.sd)
combined.cor<-list(black=black.cor, white=white.cor)
combined.cov<-list(black=black.cov, white=white.cov)
combined.n<-list(black=305, white=1868)
combined.means<-list(black=black.means, white=white.means)
combined.cov
Mc<-function(object, digits=3){
fit<-inspect(object, "fit")
chisq=unlist(fit["chisq"])
df<-unlist(fit["df"])
n<-object@SampleStats@ntotal
ncp<-max(chisq-df,0)
d<-ncp/(n-1)
Mc=exp((d)*-.5)
Mc # Mcdonald's NCI (function made by Alexander Beaujean)
}
black.ev<-eigen(black.cor) # eigenvalues of the (already computed) correlation matrix
black.ap<-parallel(subject=305,var=ncol(black.cor), rep=100,cent=.05)
black.nS<-nScree(x=black.ev$values, aparallel=black.ap$eigen$qevpea)
plotnScree(black.nS) # how many factors to retain (using Scree plot)
white.ev<-eigen(white.cor)
white.ap<-parallel(subject=1868,var=ncol(white.cor), rep=100,cent=.05)
white.nS<-nScree(x=white.ev$values, aparallel=white.ap$eigen$qevpea)
plotnScree(white.nS) # how many factors to retain (using Scree plot)
fa.parallel(black.cor, n.obs=305) # parallel analysis scree plot of factor analysis and principal component analysis
fa.parallel(white.cor, n.obs=1868) # parallel analysis scree plot of factor analysis and principal component analysis
vssblack=VSS(black.cor, n.obs=305)
vssblack
VSS.plot(vssblack)
vssminresblack=VSS(black.cor, n=8, fm="minres", n.obs=305, plot=TRUE, title="Very Simple Structure for black correlation")
vssminresblack
VSS.plot(vssminresblack)
vssmleblack=VSS(black.cor, n=8, fm="mle", n.obs=305, plot=TRUE, title="Very Simple Structure for black correlation")
vssmleblack
VSS.plot(vssmleblack)
vsspablack=VSS(black.cor, n=8, fm="pa", n.obs=305, plot=TRUE, title="Very Simple Structure for black correlation")
vsspablack
VSS.plot(vsspablack)
vsswhite=VSS(white.cor, n.obs=1868)
vsswhite
VSS.plot(vsswhite)
vssminreswhite=VSS(white.cor, n=8, fm="minres", n.obs=1868, plot=TRUE, title="Very Simple Structure for white correlation")
vssminreswhite
VSS.plot(vssminreswhite)
vssmlewhite=VSS(white.cor, n=8, fm="mle", n.obs=1868, plot=TRUE, title="Very Simple Structure for white correlation")
vssmlewhite
VSS.plot(vssmlewhite)
vsspawhite=VSS(white.cor, n=8, fm="pa", n.obs=1868, plot=TRUE, title="Very Simple Structure for white correlation")
vsspawhite
VSS.plot(vsspawhite)
EFAblack<-factanal(covmat=black.cor, factors=3, rotation="quartimax", n.obs=305) # factanal performs a maximum-likelihood factor analysis
print(EFAblack, digits=3, cutoff=.1) # My preferred cutoff is 0.10
EFAwhite<-factanal(covmat=white.cor, factors=3, rotation="quartimax", n.obs=1868) # factanal performs a maximum-likelihood factor analysis
print(EFAwhite, digits=3, cutoff=.1) # My preferred cutoff is 0.10
PCA(black.cor)
PCA(white.cor)
blackPCA<-princomp(covmat=black.cor) # PCA from the correlation matrix itself (not raw data)
summary(blackPCA)
loadings(blackPCA)
plot(blackPCA, type="lines")
#biplot(blackPCA) # needs raw data, not just a correlation matrix
#blackPCA$scores # needs raw data, not just a correlation matrix
blackPC<-principal(black.cor, nfactors=3, rotate="quartimax")
blackPC
whitePCA<-princomp(covmat=white.cor) # PCA from the correlation matrix itself (not raw data)
summary(whitePCA)
loadings(whitePCA)
plot(whitePCA, type="lines")
#biplot(whitePCA) # needs raw data, not just a correlation matrix
#whitePCA$scores # needs raw data, not just a correlation matrix
whitePC<-principal(white.cor, nfactors=3, rotate="quartimax")
whitePC
firstorder.model<- '
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
'
black.fit <- cfa(firstorder.model, sample.cov = black.cov, sample.nobs = 305)
white.fit <- cfa(firstorder.model, sample.cov = white.cov, sample.nobs = 1868)
black.fit
white.fit
blackCFA<-cfa(firstorder.model, sample.cov=black.cov, sample.nobs=305, sample.mean=black.means, meanstructure=TRUE)
fitMeasures(blackCFA, fit.measures="all")
coef(blackCFA)
summary(blackCFA, fit.measures=TRUE)
fitted(blackCFA)
resid(blackCFA, type="standardized")
whiteCFA<-cfa(firstorder.model, sample.cov=white.cov, sample.nobs=1868, sample.mean=white.means, meanstructure=TRUE)
fitMeasures(whiteCFA, fit.measures="all")
coef(whiteCFA)
summary(whiteCFA, fit.measures=TRUE)
fitted(whiteCFA)
resid(whiteCFA, type="standardized")
combined.firstorder.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
'
configural.fit<-cfa(combined.firstorder.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means)
fitMeasures(configural.fit, fit.measures="all")
Mc(configural.fit)
MI.configural<-modificationIndices(configural.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.configural, mi>4)
metric.fit<-cfa(combined.firstorder.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, group.equal=c("loadings"))
fitMeasures(metric.fit, fit.measures="all")
Mc(metric.fit)
MI.metric<-modificationIndices(metric.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.metric, mi>4)
scalar.fit<-cfa(combined.firstorder.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, group.equal=c("loadings", "intercepts"))
fitMeasures(scalar.fit, fit.measures="all")
Mc(scalar.fit)
MI.scalar<-modificationIndices(scalar.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.scalar, mi>4)
strict.fit<-cfa(combined.firstorder.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, group.equal=c("loadings", "intercepts", "residuals"))
fitMeasures(strict.fit, fit.measures="all")
Mc(strict.fit)
MI.strict<-modificationIndices(strict.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.strict, mi>4)
measurementInvariance(combined.firstorder.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means)
combined.highorder0.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
g =~ VERB + PERF + MEM
'
combined.highorder1.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
g =~ VERB + PERF + MEM
VERB ~ 0*1
PERF ~ 0*1
MEM ~ 0*1
g ~ 0*1
'
combined.highorder2.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
g =~ VERB + PERF + MEM
VERB ~ 0*1
PERF ~ 0*1
MEM ~ 0*1
'
combined.highorder3.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
g =~ VERB + PERF + MEM
PERF ~0*1
MEM ~0*1
'
combined.highorder4.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
g =~ VERB + PERF + MEM
VERB ~0*1
MEM ~0*1
'
combined.highorder5.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
g =~ VERB + PERF + MEM
VERB ~0*1
PERF ~0*1
'
combined.highorder6.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
g =~ VERB + PERF + MEM
MEM ~0*1
'
combined.highorder7.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
g =~ VERB + PERF + MEM
VERB ~0*1
'
combined.highorder8.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
g =~ VERB + PERF + MEM
PERF ~0*1
'
highorder1.fit<-cfa(combined.highorder1.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, group.equal=c("loadings", "residuals", "lv.variances"), group.partial=c("g~~g"))
fitMeasures(highorder1.fit, fit.measures="all")
Mc(highorder1.fit)
MI.highorder1<-modificationIndices(highorder1.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.highorder1, mi>4)
highorder2.fit<-cfa(combined.highorder2.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances"), group.partial=c("g~~g"))
fitMeasures(highorder2.fit, fit.measures="all")
Mc(highorder2.fit)
MI.highorder2<-modificationIndices(highorder2.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.highorder2, mi>4)
highorder3.fit<-cfa(combined.highorder3.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances"), group.partial=c("g~~g", "VERB~~VERB"))
fitMeasures(highorder3.fit, fit.measures="all")
Mc(highorder3.fit)
MI.highorder3<-modificationIndices(highorder3.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.highorder3, mi>4)
highorder4.fit<-cfa(combined.highorder4.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances"), group.partial=c("g~~g", "PERF~~PERF"))
fitMeasures(highorder4.fit, fit.measures="all")
Mc(highorder4.fit)
MI.highorder4<-modificationIndices(highorder4.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.highorder4, mi>4)
highorder5.fit<-cfa(combined.highorder5.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances"), group.partial=c("g~~g", "MEM~~MEM"))
fitMeasures(highorder5.fit, fit.measures="all")
Mc(highorder5.fit)
MI.highorder5<-modificationIndices(highorder5.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.highorder5, mi>4)
highorder6.fit<-cfa(combined.highorder6.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances"), group.partial=c("g~~g", "PERF~~PERF", "VERB~~VERB"))
fitMeasures(highorder6.fit, fit.measures="all")
Mc(highorder6.fit)
MI.highorder6<-modificationIndices(highorder6.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.highorder6, mi>4)
highorder7.fit<-cfa(combined.highorder7.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances"), group.partial=c("g~~g", "PERF~~PERF", "MEM~~MEM"))
fitMeasures(highorder7.fit, fit.measures="all")
Mc(highorder7.fit)
MI.highorder7<-modificationIndices(highorder7.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.highorder7, mi>4)
highorder8.fit<-cfa(combined.highorder8.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances"), group.partial=c("g~~g", "MEM~~MEM", "VERB~~VERB"))
fitMeasures(highorder8.fit, fit.measures="all")
Mc(highorder8.fit)
MI.highorder8<-modificationIndices(highorder8.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.highorder8, mi>4)
# Decomposition of the group difference in g into subtest mean differences.
# lambda.y holds the first-order loadings of the 13 subtests on VERB, PERF, MEM
# (one row per subtest, one column per factor), taken from the higher-order model above.
lambda.y=matrix(c(
0.804, 0.000, 0.232,
0.821, 0.153, 0.000,
0.364, 0.000, 0.835,
1.000, 0.000, 0.000,
0.752, 0.108, 0.000,
0.000, 0.000, 1.289,
0.000, 0.000, 1.000,
0.244, 0.626, 0.000,
0.297, 0.514, 0.000,
0.000, 0.885, 0.454,
0.000, 1.000, 0.000,
0.000, 0.248, 0.774,
0.000, 0.553, 0.338),13,3,byrow=TRUE)
gamma=matrix(c(1.000,.5188,.3835),3,1) # second-order loadings of VERB, PERF, MEM on g
kappa=matrix(c(-2.5929),1,1) # latent mean difference between the groups on g
decomposition.g=lambda.y%*%gamma%*%kappa # contribution of g to each subtest's mean difference
decomposition.g
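To see how much of each observed subtest gap is reproduced by g alone, you can put the model-implied and observed differences side by side. A minimal sketch, assuming combined.means is a list with one mean vector per group and that its subtests are in the same order as the rows of lambda.y:
observed.diff<- combined.means[[1]] - combined.means[[2]] # observed subtest mean differences (assumes two groups)
round(cbind(observed=observed.diff, due.to.g=as.vector(decomposition.g)), 3)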
combined.commonfactor1.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
PERF ~0*1
MEM ~0*1
'
combined.commonfactor2.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
VERB ~0*1
MEM ~0*1
'
combined.commonfactor3.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
VERB ~0*1
PERF ~0*1
'
combined.commonfactor4.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
VERB ~0*1
'
combined.commonfactor5.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
MEM ~0*1
'
combined.commonfactor6.model <-'
VERB =~ Information + Similarities + Arithmetic + Vocabulary + Comprehension + PictureCompletion + PictureArrangement
PERF =~ Similarities + Comprehension + PictureCompletion + PictureArrangement + BlockDesign + ObjectAssembly + Coding + Mazes
MEM =~ Information + Arithmetic + DigitSpan + TappingSpan + BlockDesign + Coding + Mazes
PERF ~0*1
'
commonfactor1.fit<-cfa(combined.commonfactor1.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances", "lv.covariances"), group.partial=c("VERB~~VERB"))
fitMeasures(commonfactor1.fit, fit.measures="all")
Mc(commonfactor1.fit)
MI.commonfactor1<-modificationIndices(commonfactor1.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.commonfactor1, mi>4)
commonfactor2.fit<-cfa(combined.commonfactor2.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances", "lv.covariances"), group.partial=c("PERF~~PERF"))
fitMeasures(commonfactor2.fit, fit.measures="all")
Mc(commonfactor2.fit)
MI.commonfactor2<-modificationIndices(commonfactor2.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.commonfactor2, mi>4)
commonfactor3.fit<-cfa(combined.commonfactor3.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances", "lv.covariances"), group.partial=c("MEM~~MEM"))
fitMeasures(commonfactor3.fit, fit.measures="all")
Mc(commonfactor3.fit)
MI.commonfactor3<-modificationIndices(commonfactor3.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.commonfactor3, mi>4)
commonfactor4.fit<-cfa(combined.commonfactor4.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances", "lv.covariances"), group.partial=c("PERF~~PERF", "MEM~~MEM", "PERF~~MEM"))
fitMeasures(commonfactor4.fit, fit.measures="all")
Mc(commonfactor4.fit)
MI.commonfactor4<-modificationIndices(commonfactor4.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.commonfactor4, mi>4)
commonfactor5.fit<-cfa(combined.commonfactor5.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances", "lv.covariances"), group.partial=c("VERB~~VERB", "PERF~~PERF", "VERB~~PERF"))
fitMeasures(commonfactor5.fit, fit.measures="all")
Mc(commonfactor5.fit)
MI.commonfactor5<-modificationIndices(commonfactor5.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.commonfactor5, mi>4)
commonfactor6.fit<-cfa(combined.commonfactor6.model, sample.cov=combined.cov, sample.nobs=combined.n, sample.mean=combined.means, meanstructure=TRUE, group.equal=c("loadings", "intercepts", "residuals", "lv.variances", "lv.covariances"), group.partial=c("VERB~~VERB", "MEM~~MEM", "VERB~~MEM"))
fitMeasures(commonfactor6.fit, fit.measures="all")
Mc(commonfactor6.fit)
MI.commonfactor6<-modificationIndices(commonfactor6.fit, standardized=TRUE, power=FALSE, delta=0.1, alpha=0.05, high.power=0.75)
subset(MI.commonfactor6, mi>4)
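Rather than scrolling through each fitMeasures() output, you can collect a few indices from the models above into a single table; a minimal sketch using some of the fits estimated above:
fits<- list(highorder1=highorder1.fit, highorder2=highorder2.fit, commonfactor1=commonfactor1.fit, commonfactor4=commonfactor4.fit) # add any other fitted models you want to compare
round(sapply(fits, fitMeasures, fit.measures=c("chisq", "df", "cfi", "rmsea", "srmr", "bic")), 3)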
After MGCFA, bootstrapping is possible, but it is time-consuming with a large N.
PAR.boot<- bootstrapLavaan(chc.weak, R=100, type="ordinary", FUN="coef") # bootstrap model parameters, unstandardized
describeBy(PAR.boot) # describe the bootstrap distribution of the parameters (describeBy is from the psych package)
T.boot<- bootstrapLavaan(chc.weak, R = 100, type = "bollen.stine", FUN = fitMeasures, fit.measures = "chisq") # bootstrap test statistic + compute p-value
T.orig<- fitMeasures(chc.weak, "chisq")
pvalue.boot<- length(which(T.boot > T.orig))/length(T.boot)
pvalue.boot
fit.boot<- bootstrapLavaan(chc.weak, R=100, type="nonparametric", FUN=fitMeasures, fit.measures=c("chisq", "cfi", "rmsea", "srmr", "aic", "bic"), parallel="multicore") # bootstrap fit indices
summary(fit.boot) # describeBy would only display 2 digits after decimals
Shu Fai Cheung wrote a very useful piece, “Bootstrap Confidence Interval for Standardized Solution in lavaan”. Note that bootstrapping in cfa() requires ML, so it is not compatible with sampling.weights, which does not use the plain ML estimator. Also useful, Kim & Millsap (2014) proposed a bootstrap approach, with accompanying code, for evaluating whether approximate fit index values are an acceptable approximation given the degree of misspecification.
library(semhelpinghands) # gets you SE and CI for standardized solution
bf.weak<-cfa(chc.bifactor.weak.model, data=dgroup, group="wb", meanstructure=TRUE, std.lv=T, se="bootstrap", bootstrap=100) # can add group.equal and group.partial
fitMeasures(bf.weak, c("chisq", "df", "pvalue", "cfi", "rmsea", "srmr", "aic", "bic"))
Mc(bf.weak)
ci_boot<- standardizedSolution_boot_ci(bf.weak)
print(ci_boot, output = "text")
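For the Kim & Millsap idea, a much simplified sketch in the same spirit is shown below: it only examines the bootstrap distribution of approximate fit indices under the fitted model (their actual procedure builds a chosen degree of misspecification into the population covariance matrix before resampling, so refer to their code for the full method):
fitindices.boot<- bootstrapLavaan(chc.weak, R=100, type="bollen.stine", FUN=fitMeasures, fit.measures=c("rmsea", "cfi")) # bootstrap distribution of RMSEA and CFI under the model
quantile(fitindices.boot[,1], c(.05, .95)) # first column = rmsea; compare the sample RMSEA to this range
quantile(fitindices.boot[,2], c(.05, .95)) # second column = cfi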
(Last update: July 22nd, 2024)