##############################################

# HOW TO USE THIS FILE

# A text version of this file (ex3.R) is also available which can just be sourced

# into the R program

# This file is intended for you to use interactively by pasting into R

# Comments are in black - red bits are commands that can be pasted into R

######### ANY TEXT AFTER THE # CHARACTER ARE TREATED AS COMMENTS

# data stored in c:/GR/aprojects/peas/web/exemp3/data/ex3.Rdata

# this file is c:/GR/aprojects/peas/web/exemp3/program_code/ex3_R.html

##############################################

# data are in a data frame health

# to get the names of the variables

names(health)

# the cigarette smoking data is in a factor called health$CIGST1

# to get a table of this variable do

table(health$CIGST1)

# the next bit of code makes a new 1/0 variable for smokers

# and stores it in the data frame (health)

health$SMOKER<- (health$CIGST1=='Current cigarette smoker')

health$SMOKER[health$CIGST1=='Dont know']<-NA

health$SMOKER[health$CIGST1=='Refused/Not answered']<-NA

health$SMOKER[health$CIGST1=='schedule not obtained']<-NA
#

# to check that it has worked correctly

# you will see that we have set the result to the R missing value (NA)

# when question was not answered

table(health$CIGST1,health$SMOKER,exclude=NULL)

# now it is time to try using the survey package

# to give R access to the survey functions do

library(survey)

# The first step is to create a SURVEY DESIGN OBJECT that

# will contain both the data and the information about the design

# This design has WEIGHTS (WEIGHTA),

# it is clustered so it has PSUs (PSU)

# and it is stratified (implicitly) by deprivation category within region

# and the PSUs have been grouped into implicit strata (REGSTRAT)

# with just 2 or three PSUs per stratum

# There are TWO kinds of survey design

# 1. Using simple methods , usually for surveys WITHOUT POST-STRATIFICATION

# 2. Using replicate weights, usually for surveys with POST-STARTIFICATION

# The Scottish Health Survey STRATIFICATION and POST-STRATIFICATION so

# to allow for all features we should use functions of the second kind

# But first we set up a design to allow for clustering weighting and stratification

# that could be analysed by the simple methods

health.des <- svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health)

# see the html help file for the function svydesign for more explanation

# the ~ sign refers to it being one of the columns of the data

# the R object health.des now contains all the information about the design

# including the data

# To get details of the design

summary(health.des)

# now we can use the svymean command to get the smoking rate and its standard error

svymean(~SMOKER,health.des,deff=T)

# annoyingly this gives a missing result because of the missing values

# but we can get over this by using a command to restrict the survey design

# to the non-missing cases (!=not is.na identifies missing data)

svymean(~SMOKER,subset(health.des,!is.na(SMOKER)))

# this gives a rate of 0.332 (33.2%) and its standard error of 0.006 (0.6%)

# we can compare this with the corresponding unweighted mean

# which again has to be restricted to the unweighted cases

mean(health$SMOKER[!is.na(health$SMOKER)])

# this gives the higher rate of 34.79%

# The code below plots the illustration in the Web page for this exemplar

# It is restricted to adults under 65 to make it comparable with 1995 data

# To understand it fully you would need to learn more R from one of the help

# guides

rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$AGE<65])

wtmean<-print( svymean( ~SMOKER,subset(health.des,!is.na(SMOKER) & AGE<65)) )

par(mfrow=c(1,1))

plot(c(31,40),c(0,1),type='n',xlab='smoking rate (%)',ylab='',yaxt='n')
legend(31,1,c('weighted rate 95% C. I.','unweighted rate','1995 weighted rate'),

pch=c(15,16,17),col=c(1,2,3),bty='n',cex=c(1.5,1.5,1.5),lty=c(1,0,0))

points(rawmean*100,0.2,pch=16,cex=2,col=2)

lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.2,0.2))

points(wtmean[1]*100,0.2,pch=15,cex=2)

points(35,.2,pch=17,cex=2,col=3)

text(39,.2,'All')

# now by sex

rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$SEX=='male'& health$AGE<65])

wtmean<-print( svymean( ~SMOKER,subset(health.des,(!is.na(SMOKER) & SEX=='male'& AGE<65) ) ) )

points(rawmean*100,0.4,pch=16,cex=2,col=2)
lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.4,0.4))
points(wtmean[1]*100,0.4,pch=15,cex=2)

points(34,.4,pch=17,cex=2,col=3)

text(39,.4,'male')

rawmean<-mean(health$SMOKER[!is.na(health$SMOKER) & health$SEX=='female'& health$AGE<65])

wtmean<-print( svymean( ~SMOKER,subset(health.des,(!is.na(SMOKER) & SEX=='female'& AGE<65) ) ) )

points(rawmean*100,0.6,pch=16,cex=2,col=2)
lines((wtmean[1]+c(-2,2)*wtmean[2])*100,c(0.6,0.6))
points(wtmean[1]*100,0.6,pch=15,cex=2)

points(36,.6,pch=17,cex=2,col=3)

text(39,.6,'Female')

# We can explore how different features affect the design effect by

# setting up different designs (see web page for this exemplar)

svymean(~SMOKER,subset(svydesign(id=~IDNO, weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)

svymean(~SMOKER,subset(svydesign(id=~PSU,

weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)

svymean(~SMOKER,subset(svydesign(id=~PSU, strata=~REGSTRAT,weights=~WEIGHTA,data=health),!is.na(SMOKER)),deff=T)

# the next stage is to convert it into a design with replicate weights

# we use the default options which produces, by default, jacknife samples

# the BRR method gives almost identical results

# see html help for as.svrepdesign

health.rep.des <- as.svrepdesign(health.des)

svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)))

# now we can post-stratify this by region and then by

# age and sex

# This will not affect the weights, since this is already included here

# but it MIGHT improve the precisio of estimates

tab.region <- xtabs(GROSSWT~REGION,data=health)

health.rep.des<-postStratify(health.rep.des,~REGION,tab.region)

svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)

tab.agesex <- xtabs(GROSSWT~SEX+AGEG,data=health)

health.rep.des<-postStratify(health.rep.des,~SEX+AGEG,tab.agesex)

svrepmean(~SMOKER,subset(health.rep.des,!is.na(SMOKER)),deff=T)

# In fact you will see that it does little to improve things

# now we look at the precision of estimates for regions and Health Boards

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lothian & Fife'))

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lothian'))

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Fife'))

summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lothian & Fife'),family='binomial'))

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & REGION=='Lanarkshire,Ayrshire & Arran'))

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Lanarkshire'))

svymean(~SMOKER,subset(health.des,!is.na(SMOKER) & HBOARD=='Ayrshire & Arran'))

summary(svyglm(SMOKER~HBOARD,design=subset(health.des,REGION=='Lanarkshire,Ayrshire & Arran'),family='binomial'))

# now some logistic regression models to explore influences on SMOKING

summary(svyglm(SMOKER~SEX+AGEG+REGION,design=subset(health.des,!is.na(SMOKER)),family='binomial'))

summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+SC+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))

summary(svyglm(SMOKER~SEX+AGEG+REGION+SEX*AGEG+CARSTG5+NOFAD,design=subset(health.des,!is.na(SMOKER)),family='binomial'))

# and now some barplots to illustrate

# the importance of different factors

# in the models above 6 per page

par(mfrow=c(2,3),oma=c(0,0,0,0))

# Social Class

tt<-svyby(~SMOKER,~SC,svymean,design=subset(health.des,!is.na(SMOKER)))

barplot(100*tt[,2],names=c('NK',1,2,3.1,3.2,4,5,'NA'),main='by social class',ylab='% cigarette smokers',ylim=c(0,60))

# area deprivation

tt<-svyby(~SMOKER,~CARSTG5,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c(1:5),main='by 5 Carstairs groups',ylab='% cigarette smokers',ylim=c(0,50))

# number of adults in household- first need to group large numbers together

# and this needs to be done by adding a variable to the design object

health.des<-update(health.des,NOFAD2=NOFAD*(NOFAD<=5)+5*(NOFAD>5))

tt<-svyby(~SMOKER,~NOFAD2,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c(1:4,'5+'),main='by number of adults',ylab='% cigarette smokers',ylim=c(0,50))

# regions

tt<-svyby(~SMOKER,~REGION,svymean,design=subset(health.des,!is.na(SMOKER)))
barplot(100*tt[,2],names=c('H&I','G&T','L&F','BD&G','Gla','LA&A','FVA'),main='by regions',ylab='% cigarette smokers',ylim=c(0,50))

# age for men

tt<-svyby(~SMOKER,~AGEG,svymean,design=subset(health.des,!is.na(SMOKER) & SEX=='male'))
barplot(100*tt[,2],main='age groups men',ylab='% cigarette smokers',names=17+5*(0:11),ylim=c(0,50))

# age groups women

tt<-svyby(~SMOKER,~AGEG,svymean,design=subset(health.des,!is.na(SMOKER)& SEX=='female'))
barplot(100*tt[,2],main='age groups women',names=17+5*(0:11),ylab='% cigarette smokers',ylim=c(0,50))