library(DAAG)
data(frogs)
names(frogs)
Building the model
> fullmodel<-glm(pres.abs~altitude+distance+NoOfPools+NoOfSites+avrain+meanmin+meanmax,
data=frogs,family=binomial)
> summary(fullmodel)
AIC: 214.74
model1<-glm(pres.abs~distance+NoOfPools+meanmin,data=frogs,family=binomial)
AIC: 224.10
> anova(model1,fullmodel,test='Chisq')
Analysis of Deviance Table
Model 1: pres.abs
~ distance + NoOfPools + meanmin
Model 2: pres.abs
~ altitude + distance + NoOfPools + NoOfSites + avrain +
meanmin
+ meanmax
Resid. Df Resid.
Dev Df Deviance P(>|Chi|)
1 208
216.104
2 204
198.738 4 17.365
0.002
>
model2<-glm(pres.abs~distance+NoOfPools+meanmin+meanmax,
data=frogs,family=binomial)
> summary(model2)
Null deviance: 279.99 on 211
degrees of freedom
Residual deviance: 199.63 on 207 degrees of freedom
AIC: 209.63
> anova(model2,fullmodel,test='Chisq')
Analysis of Deviance Table
Model 1: pres.abs
~ distance + NoOfPools + meanmin
+ meanmax
Model 2: pres.abs
~ altitude + distance + NoOfPools + NoOfSites + avrain +
meanmin
+ meanmax
Resid. Df Resid.
Dev Df Deviance P(>|Chi|)
1 207
199.627
2 204
198.738 3 0.889
0.828
Assessing Fit—Have we entered variables with the correct form?
Should be linearly related to the logit.
Method 1: Group x-variable. Within each group calculate logit. Plot logit versus group midpoint and check for linearity,.
--inefficient use of continuous variable
--dependent on how the grouping is done
Method 2: Refit mode replacing variable with a restricted cubic spline function. Plotting the estimated logit against the spline function can be used to assess linearity.
library(Design)
> rcspline.plot(y=frogs$pres.abs,x=frogs$meanmin,nk=5,m=20)->hmm2
> rcspline.plot(y=frogs$pres.abs,x=frogs$meanmax,nk=5,m=20)->hmm2
#Is there a quadratic effect?
> model2b<-glm(pres.abs~distance+NoOfPools+meanmin+meanmax+I(meanmin^2)+I(meanmax^2),data=frogs,family=binomial)
> model2b<-glm(pres.abs~distance+NoOfPools+meanmin+meanmax+I(meanmin^2),
data=frogs,family=binomial)
> summary(model2b)
> model2b<-glm(pres.abs~distance+NoOfPools+meanmin+meanmax+I(meanmax^2),
data=frogs,family=binomial)
> summary(model2b)
#Examine other variables
> rcspline.plot(y=frogs$pres.abs,x=frogs$NoOfPools,nk=5,m=20)
> rcspline.plot(y=frogs$pres.abs,x=log(frogs$NoOfPools),nk=5,m=20)
> model2b<-glm(pres.abs~distance+log(NoOfPools)+meanmin+meanmax, data=frogs,family=binomial)
> rcspline.plot(y=frogs$pres.abs,x=frogs$distance,nk=4,m=20)
> rcspline.plot(y=frogs$pres.abs,x=log(frogs$distance),nk=4,m=20)
> model3<-glm(pres.abs~log(distance)+NoOfPools+meanmin+meanmax, data=frogs, family=binomial)
> summary(model3)
summary(model2)
> model3b<-glm(pres.abs~log(distance)+NoOfPools+meanmin+meanmax+I(meanmax^2), data=frogs,family=binomial)
> model3b
Stepwise Selection
Read Harrell p. 56.
stepAIC function in MASS library is better than most stepwise routines.
1. It uses AIC although if Burnham and Anderson were dead they’d turn over in their graves to hear me say that.
2. It obeys the principle of marginality, something virtually no other stewpwise program does.
a. We say x^2 is marginal to x and x is marginal to 1. If x^2 is in the model, we do not drop x or 1. We drop the marginal terms first.
b. Same holds for interactions. x1x2 is marginal to x1 and x2. We would not drop x1 while x1x2 is there.
c. factor variables are dropped en masse, we don’t drop individual dummy components.
stepAIC satisfies these conditions.
Totally mindless
library(MASS)
stepAIC(model3, scope=list(upper=~log(distance)*NoOfPools*meanmin*meanmax, lower=~1))
> model4<-glm(formula = pres.abs ~ log(distance) + NoOfPools
+ meanmin + meanmax +
log(distance):meanmin + meanmin:meanmax,
family = binomial, data = frogs)
> summary(model4)
> coef(model3)
(Intercept) log(distance) NoOfPools meanmin meanmax
19.64372120
-0.80747125 0.02728446 5.50919219
-2.40129292
> range(frogs$meanmin)
[1] 2.033333 4.333333
> range(log(frogs$distance))
[1] 5.521461 9.798127
Model Calibration
library(ROCR)
Sensitivity and Specificity versus c
> pred<-prediction(fitted(model3),frogs$pres.abs)
performance(pred,'tpr','tnr')->testy
plot(testy@alpha.values[[1]],testy@x.values[[1]],type='n')
lines(testy@alpha.values[[1]],testy@y.values[[1]])
lines(testy@alpha.values[[1]],testy@x.values[[1]])
ROC curve
> perf <- performance(pred,"tpr","fpr")
> plot(perf,col="grey82",lty=3)
Area under the curve
> lrm(pres.abs~log(distance)+NoOfPools+meanmin+meanmax,data=frogs)
Cross-validation
cv.binary(model3)
Rewrite the function to make cool plot
my.cvfunc<-function (obj = frogs.glm, rand = NULL, nfolds = 10, print.details = TRUE,seed=NULL)
{
data <- obj$data
m <- dim(data)[1]
if (is.null(seed))
{
if (is.null(rand))
rand <-
sample(nfolds, m, replace = TRUE)
}
else {
set.seed(seed)
if (is.null(rand))
rand <-
sample(nfolds, m, replace = TRUE)
}
form <- formula(obj)
yvar <- all.vars(form)[1]
obs <- data[, yvar]
ival <- unique(rand)
fam <- obj$family$family
hat <- predict(glm(form, data, family = fam), type = "response")
cvhat <- rep(0, length(rand))
if (print.details)
cat("\nFold: ")
my.out<-vector("list",10)
my.y<-vector("list",10)
for (i in ival) {
if (print.details)
cat("", i)
if (i%%20 == 0)
cat("\n")
here <- i != rand
i.glm <- glm(form, data = data[here, ], family = fam)
cvhat[!here] <- predict(i.glm, newdata = data[!here,
], family = fam, type = "response")
my.out[[i]]<-cvhat[!here]
my.y[[i]]<-data[!here,yvar]
}
list(my.out,my.y)
}
testit<-my.cvfunc(model3,seed=10)
pred
<- prediction(testit[[1]],testit[[2]])
perf
<- performance(pred,"tpr","fpr")
plot(perf,col="grey82",lty=3)
plot(perf,lwd=1,avg="vertical",spread.estimate="boxplot",add=TRUE)
abline(0,1,col=2,lty=3)
#Compare with model 4
testit<-my.cvfunc(model4,seed=10)
pred
<- prediction(testit[[1]],testit[[2]])
perf
<- performance(pred,"tpr","fpr")
plot(perf,col="grey82",lty=3)
plot(perf,lwd=1,avg="vertical",spread.estimate="boxplot",add=TRUE)
abline(0,1,col=2,lty=3)
par(mfrow=c(1,2))

More cross-validation
library(boot)
> cost <- function(r, pi=0) mean(abs(r-pi)>0.5)
> cv.glm(frogs,model3,cost,K=10)

library(Epi)
> ROC( form = pres.abs ~ log(distance) + NoOfPools + meanmin + meanmax, plot="ROC" )

