Exemplar 6 R code

names(data)

######################################################################
# First a small example with just the prevalence data and gender
# binary data imputed with MICE
# If you have not done so already you will need to install
# the library from CRAN vai the packages menu
# NOTE as of july 05,on R 2.11 this was missing from the selection
# list on the menu, so instead go to the developers' web site
# http://web.inter.nl.net/users/S.van.Buuren/mi/hmtl/mice.htm
# download the latest zip file and install this
#
# These steps only need doing once
######################################################################

dprev<-data[,c(14,22,25,28,31,34,37)]
names(dprev)

#
# set up library - you need to do this every time you 
# want to use mice. 

#
#  Now the imputations
# You need to specify logistic 
# regression because the codes  are numeric
# maxit has been set to 30 to ensure convergence
#


library(mice)
impprev<-mice(dprev,m=10, maxit=30,imputationMethod = rep("logreg",7),printFlag=T)

#  back to top 
#
# imprev  is a special object that stores the different bits
# of the imputation efficiently. It is a list and the component
# imp gives another list that is the imputed values for each
# imputed variable for cases with missing data only
# These commands will show you it
#

# Before you use plot make sure that you have saved the
#   command file newfuns.R to your computer and
# have sourced it into R (file menu from console). 
# It fixes a problem with the plots
# and summary commands on the current version of R (July 05)
##########################################

plot(impprev,F)

#
# this shows the convergence plots, ideally they
# should be without trend and uncorrelated
# They are here
#

print(impprev)
names(impprev)

#
# the imp component gives the results for 
# the rows with any missing data with all
# the imputations for one variable grouped together
# examine top 10 rows for DPREV6
#

names(impprev$imp)
impprev$imp$DPREV6[1:10,]

#
# complete(,i) gives ith complete imputati
################################################
#  next bit of code gets a matrix to show how
#   the imputed data changes the prevalence
################################################

complmeans<-apply(dprev,2,mean,na.rm=T)[-1]*100
impmeans<-100*rbind(
apply(complete(impprev,1),2,mean)[-1],
apply(complete(impprev,2),2,mean)[-1],
apply(complete(impprev,3),2,mean)[-1],
apply(complete(impprev,4),2,mean)[-1],
apply(complete(impprev,5),2,mean)[-1],
apply(complete(impprev,6),2,mean)[-1],
apply(complete(impprev,7),2,mean)[-1],
apply(complete(impprev,8),2,mean)[-1],
apply(complete(impprev,9),2,mean)[-1],
apply(complete(impprev,10),2,mean)[-1])
allmeans<-rbind(complmeans,impmeans)
print(round(allmeans,2))

#
#  back to top 
#
# post imputation inference for this data set
# first simple linear model predcition of prevalence
# compared to original data
#

 summary(glm(DPREV6~GENDER,dprev))
 summary(pool(glm.mids(DPREV6~GENDER,impprev)))

#
# This only works so far for linear models
# The post imputation functions from NORM
# allow logistic regression, though they are more work
#

library(norm)
coeffs<-list(10)
sds<-list(10)
for (i in 1:10){
coeffs[[i]]<- summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,1]
sds[[i]]<- summary(glm(DPREV6~GENDER,complete(impprev,i),family='binomial'))$coeff[,2]
}
result<-mi.inference(coeffs,sds)result
ans<-cbind(result[[1]],result[[2]],result[[8]]*100)
dimnames(ans)[[2]]<-c('mean','se','% info lost')
print(ans)

#  back to top 
#

####################################################
#
# now the same small model fitted with NORM
# This code commented out - it is in the original file
# as there is a serious bug in this package
# It has been reported by others and now
# by me too
#  back to top 
####################################################
#  now a bigger model first remove the CASEID
# as we don't want to use it in imputation
# also the prevalence columns because they 
# can be derived from the others
#
# have also missed off the drug use variables because 
# for some reason they will not be accepted as factors
# even though they only have 4 categories
# names(data)
######################################################

forimp<-data[,c(-1,-22,-25,-28,-31,-34,-37,-40:-45)]
summary(forimp)

#
# make a vector for methods and check it<
# note that norm had to be used for all categories with >4 
# cases
#

method<-c( rep("norm",12),rep("logreg",7),'polyreg',
rep(c('norm'),12))
rbind(names(forimp),method)

#
# maxit increased to 30 when plots clearly not converged
#

impbig<-mice(forimp,imputationMethod =method,m=4,maxit=30)

#
# plot convergence
#

plot(impbig,F)


#  back to top 
#
# now plot at observed and imputed values 
#

par(mfrow=c(2,2))
hist(as.matrix(forimp$DVOL6),main='DVOL6 observed',xlab='volume',col=3)
hist(forimp$DVAR6,main='DVAR6 observed',xlab='variety',col=3)
hist(impbig$imp$DVOL6[,4],main='DVOL6 imputed',xlab='volume',col=4)
hist(impbig$imp$DVAR6[,4],main='DVAR6 imputed',xlab='variety',col=4)

#
# we can see that imputing as normals has given 
# distributions with a range of vALUES
# This is fixed below
# now make a big data frame of complete data
# with the 4 imputations stacked
#

bigcomplete<-complete(impbig,'long')
names(bigcomplete)

# 
#
# now logistic regression analyses for survey data ren=moving missing categories
#

bigcomplete[,21:32]<-round(bigcomplete[,21:32])
bigcomplete[,21:32][bigcomplete[,21:32]<0]<-0
bigcomplete[,22][bigcomplete[,23]==0]<-0 # set vols to 0 if var 0
bigcomplete[,24][bigcomplete[,25]==0]<-0
bigcomplete[,26][bigcomplete[,27]==0]<-0
bigcomplete[,28][bigcomplete[,29]==0]<-0
bigcomplete[,30][bigcomplete[,31]==0]<-0
bigcomplete[,22][bigcomplete[,23]>0]<-1 # set vols to 1 if var> 0
bigcomplete[,24][bigcomplete[,25]>0]<-1
bigcomplete[,26][bigcomplete[,27]>0]<-1
bigcomplete[,28][bigcomplete[,29]>0]<-1
bigcomplete[,30][bigcomplete[,31]>0]<-1
bigcomplete$DPREV1<-bigcomplete$DVAR1 # prev to 1 if var>0
bigcomplete$DPREV2<-bigcomplete$DVAR2
bigcomplete$DPREV3<-bigcomplete$DVAR3
bigcomplete$DPREV4<-bigcomplete$DVAR4
bigcomplete$DPREV5<-bigcomplete$DVAR5
bigcomplete$DPREV6<-bigcomplete$DVAR6
bigcomplete[,35:40][bigcomplete[,35:40]>0]<-1

#
# and make matrix of prevalence for complete data and each iteration
#

prevfrombig<-
cbind(apply(bigcomplete[,35:40][bigcomplete$'.imp'==1,],2,mean)*100,
apply(bigcomplete[,35:40][bigcomplete$'.imp'==2,],2,mean)*100,
apply(bigcomplete[,35:40][bigcomplete$'.imp'==3,],2,mean)*100,
apply(bigcomplete[,35:40][bigcomplete$'.imp'==4,],2,mean)*100)
round(prevfrombig,2)

#  back to top 
#

#
# now repeat this using norm which treats everything as normal
# as mentioned above there is a nasty bug in this code


library(norm)
rngseed(100)
datam<-as.matrix(data[,-c(1,22,25,28,31,34,37)])  # exclude prevalence as this leads to collinearity
 prelim.datam<-prelim.norm(datam)
 theta.datam<-em.norm(prelim.datam)

#
# notice that this goes to 1000 iterations 
# the maximum which looks a bit dodgy
# not likely to improve with more
#

imputed.datam<-imp.norm(prelim.datam,theta.datam,datam)
imputed.datam[1:20,]

#
#   clearly some of these imputed values are nonsense
#