################################################################# # # NOTE BOTH THE PAKAGES FEATURED HERE (NORM AND MICE) # ARE NOT WORKING PROPERLY ON THE LATEST VERSIONS OF # THE R INSTALLATION (VERSION 2.1) # # THIS FILE AND OTHERS ON THE PEAS SITE WILL # BE BROUGHT UP TO DATE WHEN THEY ARE REINSTALLED ################################################ # HOW TO USE THIS FILE # This is an HTML version of the R script file ex2.R # It is intended to show you the code and to allow links. # NOT to use interactively to run the code # Pasting the into R might work, but it might not # to view the ouptut from these commands in html go to R results page
red stuff (R commands)
# # ######### ANY TEXT AFTER THE # CHARACTER (here shown black) # ARE TREATED AS COMMENTS BY R # # ############################################## # # data are in a data frame shs # to get the names of the variables #
Links in this page
Using MICE for a small imputation problem
Post imputation procedures for this
Using norm for a smallimputation problem
MICE imputation procedures for larger data set
Post imputation procedures for larger data set
Trying norm for big data set
names(data)
###################################################################### # First a small example with just the prevalence data and gender # binary data imputed with MICE # If you have not done so already you will need to install # the library from CRAN vai the packages menu # NOTE as of july 05,on R 2.11 this was missing from the selection # list on the menu, so instead go to the developers' web site # http://web.inter.nl.net/users/S.van.Buuren/mi/hmtl/mice.htm # download the latest zip file and install this # # These steps only need doing once ######################################################################
dprev<-data[,c(14,22,25,28,31,34,37)]
names(dprev)
# # set up library - you need to do this every time you # want to use mice. # # Now the imputations # You need to specify logistic # regression because the codes are numeric # maxit has been set to 30 to ensure convergence #
library(mice) impprev<-mice(dprev,m=10, maxit=30,imputationMethod = rep("logreg",7),printFlag=T)
# back to top # # imprev is a special object that stores the different bits # of the imputation efficiently. It is a list and the component # imp gives another list that is the imputed values for each # imputed variable for cases with missing data only # These commands will show you it # # Before you use plot make sure that you have saved the # command file newfuns.R to your computer and # have sourced it into R (file menu from console). # It fixes a problem with the plots # and summary commands on the current version of R (July 05) ##########################################
plot(impprev,F)
# # this shows the convergence plots, ideally they # should be without trend and uncorrelated # They are here #
print(impprev) names(impprev)
# # the imp component gives the results for # the rows with any missing data with all # the imputations for one variable grouped together # examine top 10 rows for DPREV6 #
names(impprev$imp) impprev$imp$DPREV6[1:10,]
# # complete(,i) gives ith complete imputati ################################################ # next bit of code gets a matrix to show how # the imputed data changes the prevalence ################################################
complmeans<-apply(dprev,2,mean,na.rm=T)[-1]*100 impmeans<-100*rbind( apply(complete(impprev,1),2,mean)[-1], apply(complete(impprev,2),2,mean)[-1], apply(complete(impprev,3),2,mean)[-1], apply(complete(impprev,4),2,mean)[-1], apply(complete(impprev,5),2,mean)[-1], apply(complete(impprev,6),2,mean)[-1], apply(complete(impprev,7),2,mean)[-1], apply(complete(impprev,8),2,mean)[-1], apply(complete(impprev,9),2,mean)[-1], apply(complete(impprev,10),2,mean)[-1]) allmeans<-rbind(complmeans,impmeans) print(round(allmeans,2))
# # back to top # # post imputation inference for this data set # first simple linear model predcition of prevalence # compared to original data #
summary(glm(DPREV6~GENDER,dprev)) summary(pool(glm.mids(DPREV6~GENDER,impprev)))
# # This only works so far for linear models # The post imputation functions from NORM # allow logistic regression, though they are more work #
library(norm) coeffs<-list(10) sds<-list(10) for (i in 1:10){ coeffs[[i]]<- summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,1] sds[[i]]<- summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,2] } result<-mi.inference(coeffs,sds)result ans<-cbind(result[[1]],result[[2]],result[[8]]*100) dimnames(ans)[[2]]<-c('mean','se','% info lost') print(ans)
# back to top #
#################################################### #
# now the same small model fitted with NORM # This code commented out - it is in the original file # as there is a serious bug in this package
# It has been reported by others and now
# by me too # back to top #################################################### # now a bigger model first remove the CASEID # as we don't want to use it in imputation # also the prevalence columns because they # can be derived from the others # # have also missed off the drug use variables because # for some reason they will not be accepted as factors # even though they only have 4 categories # names(data) ######################################################
forimp<-data[,c(-1,-22,-25,-28,-31,-34,-37,-40:-45)] summary(forimp)
# # make a vector for methods and check it< # note that norm had to be used for all categories with >4 # cases #
method<-c( rep("norm",12),rep("logreg",7),'polyreg', rep(c('norm'),12))
rbind(names(forimp),method)
# # maxit increased to 30 when plots clearly not converged #
impbig<-mice(forimp,imputationMethod =method,m=4,maxit=30)
#
# plot convergence #
plot(impbig,F)
# back to top #
# now plot at observed and imputed values
#
par(mfrow=c(2,2)) hist(as.matrix(forimp$DVOL6),main='DVOL6 observed',xlab='volume',col=3) hist(forimp$DVAR6,main='DVAR6 observed',xlab='variety',col=3) hist(impbig$imp$DVOL6[,4],main='DVOL6 imputed',xlab='volume',col=4) hist(impbig$imp$DVAR6[,4],main='DVAR6 imputed',xlab='variety',col=4)
# # we can see that imputing as normals has given # distributions with a range of vALUES # This is fixed below # now make a big data frame of complete data # with the 4 imputations stacked #
bigcomplete<-complete(impbig,'long') names(bigcomplete)
# # # now logistic regression analyses for survey data ren=moving missing categories #
bigcomplete[,21:32]<-round(bigcomplete[,21:32]) bigcomplete[,21:32][bigcomplete[,21:32]<0]<-0 bigcomplete[,22][bigcomplete[,23]==0]<-0 # set vols to 0 if var 0 bigcomplete[,24][bigcomplete[,25]==0]<-0 bigcomplete[,26][bigcomplete[,27]==0]<-0 bigcomplete[,28][bigcomplete[,29]==0]<-0 bigcomplete[,30][bigcomplete[,31]==0]<-0 bigcomplete[,22][bigcomplete[,23]>0]<-1 # set vols to 1 if var> 0 bigcomplete[,24][bigcomplete[,25]>0]<-1 bigcomplete[,26][bigcomplete[,27]>0]<-1 bigcomplete[,28][bigcomplete[,29]>0]<-1 bigcomplete[,30][bigcomplete[,31]>0]<-1 bigcomplete$DPREV1<-bigcomplete$DVAR1 # prev to 1 if var>0 bigcomplete$DPREV2<-bigcomplete$DVAR2 bigcomplete$DPREV3<-bigcomplete$DVAR3 bigcomplete$DPREV4<-bigcomplete$DVAR4 bigcomplete$DPREV5<-bigcomplete$DVAR5 bigcomplete$DPREV6<-bigcomplete$DVAR6 bigcomplete[,35:40][bigcomplete[,35:40]>0]<-1
# # and make matrix of prevalence for complete data and each iteration #
prevfrombig<- cbind(apply(bigcomplete[,35:40][bigcomplete$'.imp'==1,],2,mean)*100, apply(bigcomplete[,35:40][bigcomplete$'.imp'==2,],2,mean)*100, apply(bigcomplete[,35:40][bigcomplete$'.imp'==3,],2,mean)*100, apply(bigcomplete[,35:40][bigcomplete$'.imp'==4,],2,mean)*100) round(prevfrombig,2) # back to top #
# # now repeat this using norm which treats everything as normal # as mentioned above there is a nasty bug in this code
library(norm) rngseed(100) datam<-as.matrix(data[,-c(1,22,25,28,31,34,37)]) # exclude prevalence as this leads to collinearity prelim.datam<-prelim.norm(datam) theta.datam<-em.norm(prelim.datam)
# # notice that this goes to 1000 iterations # the maximum which looks a bit dodgy # not likely to improve with more #
imputed.datam<-imp.norm(prelim.datam,theta.datam,datam) imputed.datam[1:20,]
# # clearly some of these imputed values are nonsense #