##############################################
#
# HOW TO USE THIS FILE
# A text version of this file (ex3.R) is also available which can just be sourced
# into the R program
# This file is intended for you to use interactively by pasting into R
# Comments are in black - red bits are commands that can be pasted into R
#
######### ANY TEXT AFTER THE # CHARACTER ARE
TREATED AS COMMENTS
#
# data stored in
c:/GR/aprojects/peas/web/exemp3/data/ex3.Rdata
# this file is
c:/GR/aprojects/peas/web/exemp3/program_code/ex3_R.html
#
##############################################
#
# data are in a data frame
health
# to get the names of the variables
#
names(health)
#
# the cigarette smoking data is in a factor called health$CIGST1
# to get a table of this variable
do
#
table(health$CIGST1)
#
# the next bit of code makes a new 1/0 variable for smokers
# and stores it in the data frame
(health)
#
health$SMOKER<- (health$CIGST1=='Current cigarette smoker')
health$SMOKER[health$CIGST1=='Dont know']<-NA
health$SMOKER[health$CIGST1=='Refused/Not answered']<-NA
health$SMOKER[health$CIGST1=='schedule not
obtained']<-NA
# to check that it has worked correctly
# you will see that we have set the result to the R missing value (NA)
# when question was not answered
#
table(health$CIGST1,health$SMOKER,exclude=NULL)
#
#
now it is time to try using the survey package
#
to give R access to the survey functions do
#
library(survey)
#
# The first step is to create a SURVEY
DESIGN OBJECT that
# will contain both the data and the information about the design
#
# This design has WEIGHTS (WEIGHTA),
# it is clustered so it has PSUs (PSU)
# and it is stratified (implicitly) by deprivation category within region
# and the PSUs have been grouped into implicit strata (REGSTRAT)
# with just 2 or three PSUs per
stratum
# There are TWO kinds of survey
design
#
#
1. Using simple methods , usually for surveys WITHOUT
POST-STRATIFICATION
#
2. Using replicate weights, usually for surveys with
POST-STARTIFICATION
#
#
The Scottish Health Survey STRATIFICATION and POST-STRATIFICATION so
# to allow for all features we should use
functions of the second kind
# But first we set up a design to allow for clustering weighting and stratification
# that could be analysed by the simple methods
#
health.des <- svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health)
#
# see the html help file for the function svydesign for more explanation
# the ~ sign refers to it being one of the columns of the data
# the R object health.des now contains all the information about the design
# including the data
# To get details of the design
summary(health.des)
#
# now we can use the svymean command to get the smoking rate and its standard error
#
svymean(~SMOKER,health.des,deff=T)
#
# annoyingly this gives a missing result because of the missing values
# but we can get over this by using a command to restrict the survey design
# to the non-missing cases (!=not is.na identifies missing data)
#
svymean(~SMOKER,subset(health.des,!is.na(SMOKER)))
# this gives a rate of 0.332 (33.2%) and its standard error of 0.006 (0.6%)
# we can compare this with the corresponding unweighted mean
# which again has to be restricted to the unweighted cases
#
mean(health$SMOKER[!is.na(health$SMOKER)])
#
# this gives the higher rate of 34.79%
#
# The code below plots the illustration in the Web page for this exemplar
# It is restricted to adults under 65 to make it comparable with 1995 data
# To understand it fully you would need to learn more R from one of the help
# guides
#
rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$AGE<65])
wtmean<-print( svymean( ~SMOKER,subset(health.des,!is.na(SMOKER) & AGE<65)) )
par(mfrow=c(1,1))
plot(c(31,40),c(0,1),type='n',xlab='smoking rate
(%)',ylab='',yaxt='n')
legend(31,1,c('weighted rate 95% C. I.','unweighted
rate','1995 weighted rate'),
pch=c(15,16,17),col=c(1,2,3),bty='n',cex=c(1.5,1.5,1.5),lty=c(1,0,0))
points(rawmean*100,0.2,pch=16,cex=2,col=2)
lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.2,0.2))
points(wtmean[1]*100,0.2,pch=15,cex=2)
points(35,.2,pch=17,cex=2,col=3)
text(39,.2,'All')
#
# now by sex
#
rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$SEX=='male'& health$AGE<65])
wtmean<-print( svymean( ~SMOKER,subset(health.des,(!is.na(SMOKER) & SEX=='male'& AGE<65) ) ) )
points(rawmean*100,0.4,pch=16,cex=2,col=2)
lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.4,0.4))
points(wtmean[1]*100,0.4,pch=15,cex=2)
points(34,.4,pch=17,cex=2,col=3)
text(39,.4,'male')
rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$SEX=='female'& health$AGE<65])
wtmean<-print( svymean( ~SMOKER,subset(health.des,(!is.na(SMOKER) & SEX=='female'& AGE<65) ) ) )
points(rawmean*100,0.6,pch=16,cex=2,col=2)
lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.6,0.6))
points(wtmean[1]*100,0.6,pch=15,cex=2)
points(36,.6,pch=17,cex=2,col=3)
text(39,.6,'Female')
#
# We can explore how different features affect the design effect by
# setting up different designs (see web page for this exemplar)
#
svymean(~SMOKER,subset(svydesign(id=~IDNO, weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
svymean(~SMOKER,subset(svydesign(id=~PSU,
weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
svymean(~SMOKER,subset(svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
# the next stage is to convert it into a design with replicate weights
# we use the default options which produces, by default, jacknife samples
# the BRR method gives almost identical results
# see html help for as.svrepdesign
#
health.rep.des <- as.svrepdesign(health.des)
svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)))
#
# now we can post-stratify this by region and then by
# age and sex
# This will not affect the weights, since this is already included here
# but it MIGHT improve the precisio of estimates
#
tab.region <- xtabs(GROSSWT~REGION,data=health)
health.rep.des<-postStratify(health.rep.des,~REGION,tab.region)
svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)
tab.agesex <- xtabs(GROSSWT~SEX+AGEG,data=health)
health.rep.des<-postStratify(health.rep.des,~SEX+AGEG,tab.agesex)
svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)
#
# In fact you will see that it does little to improve things
#
# now we look at the precision of estimates for regions and Health Boards
#
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lothian & Fife'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lothian'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Fife'))
summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lothian
& Fife'),family='binomial'))
#
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lanarkshire,Ayrshire & Arran'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lanarkshire'))
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Ayrshire & Arran'))
summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lanarkshire,Ayrshire & Arran'),family='binomial'))
#
# now some logistic regression models to explore influences on SMOKING
#
summary(svyglm(SMOKER~SEX+AGEG+REGION,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+SC+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+CARSTG5+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
#
# and now some barplots to illustrate
# the importance of different factors
# in the models above 6 per page
#
par(mfrow=c(2,3),oma=c(0,0,0,0))
#
# Social Class
#
tt<-svyby(~SMOKER,~SC,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c('NK',1,2,3.1,3.2,4,5,'NA'),main='by social class',ylab='% cigarette smokers',ylim=c(0,60))
#
# area deprivation
#
tt<-svyby(~SMOKER,~CARSTG5,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c(1:5),main='by
5 Carstairs groups',ylab='% cigarette smokers',ylim=c(0,50))
#
# number of adults in household- first need to group large numbers together
# and this needs to be done by adding a variable to the design object
#
health.des<-update(health.des,NOFAD2=NOFAD*(NOFAD<=5)+5*(NOFAD>5))
tt<-svyby(~SMOKER,~NOFAD2,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c(1:4,'5+'),main='by
number of adults',ylab='% cigarette smokers',ylim=c(0,50))
#
# regions
#
tt<-svyby(~SMOKER,~REGION,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c('H&I','G&T','L&F','BD&G','Gla','LA&A','FVA'),main='by
regions',ylab='% cigarette smokers',ylim=c(0,50))
#
# age for men
#
tt<-svyby(~SMOKER,~AGEG,svymean,design=subset(health.des,!is.na(SMOKER)
& SEX=='male'))
barplot(100*tt[,2],main='age groups men',ylab='%
cigarette smokers',names=17+5*(0:11),ylim=c(0,50))
#
# age groups women
#
tt<-svyby(~SMOKER,~AGEG,svymean,design=subset(health.des,!is.na(SMOKER)&
SEX=='female'))
barplot(100*tt[,2],main='age groups
women',names=17+5*(0:11),ylab='% cigarette smokers',ylim=c(0,50))