##############################################
#
# HOW TO USE THIS FILE
# A text version of this file (ex3.R) is also available which can just be sourced
# into the R program
# This file is intended for you to use interactively by pasting into R
# Comments are in black - red bits are commands that can be pasted into R
#
######### ANY TEXT AFTER
THE # CHARACTER ARE TREATED AS COMMENTS
#
# data stored in
c:/GR/aprojects/peas/web/exemp3/data/ex3.Rdata
# this file is
c:/GR/aprojects/peas/web/exemp3/program_code/ex3_R.html
#
##############################################
#
#
data are in a data frame health
# to get the names of the variables
#
names(health)
[1] "AGE" "SEX" "REGION" "HBOARD" "CARSTAIR" "CARSTG5" [7] "CIGST1" "CIGST2" "ARCHPT" "NOFAD" "SC" "WEIGHTA" [13] "GROSSWT" "IDNO" "PSUNO" "PSU" "MONTH" "QUARTER" [19] "STRATUM" "NIN" "REGSTRAT" "AGEG"
#
# the cigarette smoking data is in a factor called health$CIGST1
# to get a table of this variable do
#
table(health$CIGST1)
Refused/Not answered Dont know 14 16 schedule not obtained not applicable 3 3 Never smoked cigarettes at all Used to smoke cigarettes occasionally 3711 269 Used to smoke cigarettes regularly Current cigarette smoker 1895 3136
# the next bit of code makes a new 1/0 variable for smokers
# and stores it in the
data frame (health)
#
health$SMOKER<- (health$CIGST1=='Current cigarette smoker')
health$SMOKER[health$CIGST1=='Dont know']<-NA
health$SMOKER[health$CIGST1=='Refused/Not answered']<-NA
health$SMOKER[health$CIGST1=='schedule
not obtained']<-NA
#
# to check that it has worked correctly
# you will see that we have set the result to the R missing value (NA)
# when question was not answered
#
table(health$CIGST1,health$SMOKER,exclude=NULL)
FALSE TRUE <NA> Refused/Not answered 0 0 14 Dont know 0 0 16 schedule not obtained 0 0 3 not applicable 3 0 0 Never smoked cigarettes at all 3711 0 0 Used to smoke cigarettes occasionally 269 0 0 Used to smoke cigarettes regularly 1895 0 0 Current cigarette smoker 0 3136 0
#
# now it is time to try using the survey
package
# to give R access to the survey functions do
#
library(survey)
#
#
The first step is to create a SURVEY DESIGN OBJECT that
# will contain both the data and the information about the design
#
# This design has WEIGHTS (WEIGHTA),
# it is clustered so it has PSUs (PSU)
# and it is stratified (implicitly) by deprivation category within region
# and the PSUs have been grouped into implicit strata (REGSTRAT)
#
with just 2 or three PSUs per stratum
#
There are TWO kinds of survey design
#
# 1. Using simple methods , usually for
surveys WITHOUT POST-STRATIFICATION
# 2. Using replicate weights, usually for
surveys with POST-STARTIFICATION
#
# The Scottish Health Survey STRATIFICATION
and POST-STRATIFICATION so
# to allow for all features we should use functions of the second kind
# But first we set up a design to allow for clustering weighting and stratification
# that could be analysed by the simple methods
#
health.des <- svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health)
#
# see the html help file for the function svydesign for more explanation
# the ~ sign refers to it being one of the columns of the data
# the R object health.des now contains all the information about the design
# including the data
# To get details of the design
summary(health.des)
#
Stratified 1 - level Cluster Sampling design With ( 312 ) clusters. svydesign(id = ~PSU, strata = ~REGSTRAT, weights = ~WEIGHTA, data = health) Probabilities: Min. 1st Qu. Median Mean 3rd Qu. Max. 0.1260 0.7825 1.1090 1.3920 1.7710 15.3900 Stratum sizes: 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 201 202 obs 48 45 58 65 52 71 68 65 49 54 69 54 59 56 89 57 57 design.PSU 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 actual.PSU 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2
lines omitted
719 obs 51 design.PSU 2 actual.PSU 2 Data variables: [1] "AGE" "SEX" "REGION" "HBOARD" "CARSTAIR" "CARSTG5" [7] "CIGST1" "CIGST2" "ARCHPT" "NOFAD" "SC" "WEIGHTA" [13] "GROSSWT" "IDNO" "PSUNO" "PSU" "MONTH" "QUARTER" [19] "STRATUM" "NIN" "REGSTRAT" "AGEG" "SMOKER"
# now we can use the svymean command to get the smoking rate and its standard error
#
svymean(~SMOKER,health.des,deff=T)
mean SE DEff SMOKER NA NA NA
#
# annoyingly this gives a missing result because of the missing values
# but we can get over this by using a command to restrict the survey design
# to the non-missing cases (!=not is.na identifies missing data)
#
svymean(~SMOKER,subset(health.des,!is.na(SMOKER)))
mean SE SMOKER 0.33312 0.006
# this gives a rate of 0.332 (33.2%) and its standard error of 0.006 (0.6%)
# we can compare this with the corresponding unweighted mean
# which again has to be restricted to the unweighted cases
#
mean(health$SMOKER[!is.na(health$SMOKER)])
[1] 0.3479033
#
# this gives the higher rate of 34.79%
#
#
# We can explore how different features affect the design effect by
# setting up different designs (see web page for this exemplar)
#
svymean(~SMOKER,subset(svydesign(id=~IDNO, weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
mean SE DEff SMOKER 0.3331194 0.0057008 1.3187
svymean(~SMOKER,subset(svydesign(id=~PSU,
weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
mean SE DEff SMOKER 0.3331194 0.0074961 2.28
svymean(~SMOKER,subset(svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)
mean SE DEff SMOKER 0.3331194 0.0060102 1.4657
# the next stage is to convert it into a design with replicate weights
# we use the default options which produces, by default, jacknife samples
# the BRR method gives almost identical results
# see html help for as.svrepdesign
#
health.rep.des <- as.svrepdesign(health.des)
svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)))
mean SE SMOKER 0.33312 0.006
#
# now we can post-stratify this by region and then by
# age and sex
# This will not affect the weights, since this is already included here
# but it MIGHT improve the precisio of estimates
#
tab.region <- xtabs(GROSSWT~REGION,data=health)
health.rep.des<-postStratify(health.rep.des,~REGION,tab.region)
svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)
mean SE DEff SMOKER 0.3331194 0.0060317 1.4762
tab.agesex <- xtabs(GROSSWT~SEX+AGEG,data=health)
health.rep.des<-postStratify(health.rep.des,~SEX+AGEG,tab.agesex)
svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)
mean SE DEff SMOKER 0.3331194 0.0059543 1.4386
#
# In fact you will see that it does little to improve things
#
# now we look at the precision of estimates for regions and Health Boards
#
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lothian & Fife'))
mean SE SMOKER 0.32138 0.0114
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lothian'))
mean SE SMOKER 0.30387 0.0152
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Fife'))
mean SE SMOKER 0.35144 0.018
summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lothian & Fife'),family='binomial'))
Call: svyglm(formula = SMOKER ~ HBOARD, design = subset(health.des, REGION == "Lothian & Fife"), family = "binomial")
Survey design: subset.survey.design(health.des, REGION == "Lothian & Fife")
Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.61273 0.07894 -7.762 1.38e-14 *** HBOARDLothian -0.21622 0.10769 -2.008 0.0448 * --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for binomial family taken to be 0.9959916)
Number of Fisher Scoring iterations: 4
#
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lanarkshire,Ayrshire & Arran'))
mean SE SMOKER 0.34259 0.0131
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lanarkshire'))
mean SE SMOKER 0.34435 0.0187
svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Ayrshire & Arran'))
mean SE SMOKER 0.34052 0.0249
summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lanarkshire,Ayrshire & Arran'),family='binomial'))
Call: svyglm(formula = SMOKER ~ HBOARD, design = subset(health.des, REGION == "Lanarkshire,Ayrshire & Arran"), family = "binomial") Survey design: subset.survey.design(health.des, REGION == "Lanarkshire,Ayrshire & Arran") Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.66097 0.11088 -5.961 3.08e-09 *** HBOARDLanarkshire 0.01702 0.15521 0.110 0.913 --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 (Dispersion parameter for binomial family taken to be 0.990106) Number of Fisher Scoring iterations: 3
#
# now some logistic regression models to explore influences on SMOKING
#
summary(svyglm(SMOKER~SEX+AGEG+REGION,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
Call: svyglm(formula = SMOKER ~ SEX + AGEG + REGION, design = subset(health.des, !is.na(SMOKER)), family = "binomial") Survey design: subset.survey.design(health.des, !is.na(SMOKER)) Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.92027 0.16836 -5.466 4.73e-08 *** SEXfemale -0.06587 0.05076 -1.298 0.194410 AGEG25-29 0.61030 0.17973 3.396 0.000688 *** AGEG35-39 0.36622 0.14929 2.453 0.014179 * AGEG45-49 0.49244 0.15146 3.251 0.001153 ** AGEG55-59 0.25062 0.14305 1.752 0.079799 . AGEG65-69 0.32661 0.15307 2.134 0.032887 * AGEG70-74 0.25268 0.16417 1.539 0.123806 AGEG60-64 0.29999 0.14616 2.052 0.040152 * AGEG50-54 0.29326 0.15695 1.868 0.061731 . AGEG40-44 0.03624 0.15342 0.236 0.813274 AGEG30-34 -0.19737 0.16570 -1.191 0.233641 AGEG20-24 -0.39945 0.15669 -2.549 0.010812 * REGIONGrampian & Tayside -0.01852 0.13501 -0.137 0.890867 REGIONLothian & Fife -0.05794 0.12436 -0.466 0.641314 REGIONBorders, Dumfries & Galloway -0.16581 0.15708 -1.056 0.291182 REGIONGlagow 0.14473 0.13502 1.072 0.283761 REGIONLanarkshire,Ayrshire & Arran 0.05118 0.12561 0.407 0.683698 REGIONForth Valley, Argyll & Clyde -0.01661 0.13307 -0.125 0.900662 --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 (Dispersion parameter for binomial family taken to be 0.9999929) Number of Fisher Scoring iterations: 4
summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+SC+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
Call: svyglm(formula = SMOKER ~ SEX + AGEG + REGION + SEX * AGEG + SC + NOFAD, design = subset(health.des, !is.na(SMOKER)), family = "binomial") Survey design: subset.survey.design(health.des, !is.na(SMOKER)) Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.68445 0.27064 -2.529 0.0115 * SEXfemale 0.26976 0.27989 0.964 0.3352 AGEG25-29 0.67066 0.29497 2.274 0.0230 * AGEG35-39 -0.21431 0.22495 -0.953 0.3408 AGEG45-49 0.04875 0.24178 0.202 0.8402 AGEG55-59 -0.22199 0.23247 -0.955 0.3396 AGEG65-69 -0.10241 0.24130 -0.424 0.6713 AGEG70-74 -0.15697 0.24767 -0.634 0.5262 AGEG60-64 -0.22489 0.27364 -0.822 0.4112 AGEG50-54 -0.47543 0.24051 -1.977 0.0481 * AGEG40-44 -0.40650 0.26823 -1.515 0.1297 AGEG30-34 -1.01923 0.25325 -4.025 5.75e-05 *** AGEG20-24 -1.27419 0.24841 -5.129 2.97e-07 *** REGIONGrampian & Tayside 0.02349 0.14229 0.165 0.8689 REGIONLothian & Fife -0.03669 0.13112 -0.280 0.7796 REGIONBorders, Dumfries & Galloway -0.19086 0.16273 -1.173 0.2409 REGIONGlagow 0.21274 0.13806 1.541 0.1234 REGIONLanarkshire,Ayrshire & Arran 0.07497 0.12886 0.582 0.5607 REGIONForth Valley, Argyll & Clyde -0.01536 0.13601 -0.113 0.9101 SC 0.29678 0.02380 12.471 < 2e-16 *** NOFAD -0.27962 0.05004 -5.588 2.37e-08 *** SEXfemale:AGEG25-29 -0.88269 0.38027 -2.321 0.0203 * SEXfemale:AGEG35-39 -0.26478 0.32817 -0.807 0.4198 SEXfemale:AGEG45-49 -0.67796 0.30189 -2.246 0.0247 * SEXfemale:AGEG55-59 -0.48718 0.30300 -1.608 0.1079 SEXfemale:AGEG65-69 -0.34475 0.31941 -1.079 0.2805 SEXfemale:AGEG70-74 -0.40652 0.32935 -1.234 0.2171 SEXfemale:AGEG60-64 -0.32320 0.36035 -0.897 0.3698 SEXfemale:AGEG50-54 -0.01855 0.33280 -0.056 0.9555 SEXfemale:AGEG40-44 -0.76932 0.34928 -2.203 0.0277 * SEXfemale:AGEG30-34 -0.10401 0.33665 -0.309 0.7574 SEXfemale:AGEG20-24 -0.14086 0.34942 -0.403 0.6869 --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+CARSTG5+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))
Call: svyglm(formula = SMOKER ~ SEX + AGEG + REGION + SEX * AGEG + CARSTG5 + NOFAD, design = subset(health.des, !is.na(SMOKER)), family = "binomial") Survey design: subset.survey.design(health.des, !is.na(SMOKER)) Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.84591 0.25880 -3.269 0.001085 ** SEXfemale 0.24681 0.27504 0.897 0.369558 AGEG25-29 0.92319 0.28223 3.271 0.001075 ** AGEG35-39 0.17917 0.21854 0.820 0.412336 AGEG45-49 0.47653 0.23109 2.062 0.039225 * AGEG55-59 0.18258 0.22250 0.821 0.411899 AGEG65-69 0.26896 0.22884 1.175 0.239895 AGEG70-74 0.26452 0.23738 1.114 0.265161 AGEG60-64 0.18358 0.25527 0.719 0.472065 AGEG50-54 -0.04392 0.23055 -0.191 0.848918 AGEG40-44 0.03001 0.25316 0.119 0.905640 AGEG30-34 -0.62017 0.23695 -2.617 0.008879 ** AGEG20-24 -0.84079 0.24086 -3.491 0.000484 *** REGIONGrampian & Tayside 0.01935 0.14091 0.137 0.890772 REGIONLothian & Fife -0.13188 0.12894 -1.023 0.306433 REGIONBorders, Dumfries & Galloway -0.15780 0.16500 -0.956 0.338918 REGIONGlagow -0.13488 0.14070 -0.959 0.337769 REGIONLanarkshire,Ayrshire & Arran -0.14715 0.13507 -1.089 0.275982 REGIONForth Valley, Argyll & Clyde -0.17572 0.13850 -1.269 0.204573 CARSTG5 0.23819 0.02199 10.829 < 2e-16 *** NOFAD -0.26871 0.04293 -6.260 4.03e-10 *** SEXfemale:AGEG25-29 -0.84820 0.37402 -2.268 0.023364 * SEXfemale:AGEG35-39 -0.28549 0.32528 -0.878 0.380147 SEXfemale:AGEG45-49 -0.63784 0.29589 -2.156 0.031137 * SEXfemale:AGEG55-59 -0.47193 0.30514 -1.547 0.121992 SEXfemale:AGEG65-69 -0.28833 0.31546 -0.914 0.360747 SEXfemale:AGEG70-74 -0.33610 0.32837 -1.024 0.306081 SEXfemale:AGEG60-64 -0.22750 0.34888 -0.652 0.514368 SEXfemale:AGEG50-54 0.02436 0.33317 0.073 0.941724 SEXfemale:AGEG40-44 -0.72016 0.34619 -2.080 0.037533 * SEXfemale:AGEG30-34 0.01953 0.32453 0.060 0.952011 SEXfemale:AGEG20-24 -0.07382 0.34487 -0.214 0.830509 --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 (Dispersion parameter for binomial family taken to be 1.000655)