library(DAAG)

data(frogs)

names(frogs)

 

Building the model

 

> fullmodel<-glm(pres.abs~altitude+distance+NoOfPools+NoOfSites+avrain+meanmin+meanmax, data=frogs,family=binomial)

> summary(fullmodel)

AIC: 214.74

model1<-glm(pres.abs~distance+NoOfPools+meanmin,data=frogs,family=binomial)

AIC: 224.10

 

> anova(model1,fullmodel,test='Chisq')

Analysis of Deviance Table

 

Model 1: pres.abs ~ distance + NoOfPools + meanmin

Model 2: pres.abs ~ altitude + distance + NoOfPools + NoOfSites + avrain +

    meanmin + meanmax

  Resid. Df Resid. Dev  Df Deviance P(>|Chi|)

1       208    216.104                      

2       204    198.738   4   17.365     0.002

 

> model2<-glm(pres.abs~distance+NoOfPools+meanmin+meanmax, data=frogs,family=binomial)

> summary(model2)

    Null deviance: 279.99  on 211  degrees of freedom

Residual deviance: 199.63  on 207  degrees of freedom

AIC: 209.63

 

> anova(model2,fullmodel,test='Chisq')

Analysis of Deviance Table

 

Model 1: pres.abs ~ distance + NoOfPools + meanmin + meanmax

Model 2: pres.abs ~ altitude + distance + NoOfPools + NoOfSites + avrain +

    meanmin + meanmax

  Resid. Df Resid. Dev  Df Deviance P(>|Chi|)

1       207    199.627                      

2       204    198.738   3    0.889     0.828

 

 

Assessing Fit—Have we entered variables with the correct form?

Should be linearly related to the logit.

 

Method 1: Group x-variable. Within each group calculate logit. Plot logit versus group midpoint and check for linearity,.

            --inefficient use of continuous variable

            --dependent on how the grouping is done

 

Method 2: Refit mode replacing variable with a restricted cubic spline function. Plotting the estimated logit against the spline function can be used to assess linearity.

 

library(Design)

 

> rcspline.plot(y=frogs$pres.abs,x=frogs$meanmin,nk=5,m=20)->hmm2

> rcspline.plot(y=frogs$pres.abs,x=frogs$meanmax,nk=5,m=20)->hmm2

 

#Is there a quadratic effect?

> model2b<-glm(pres.abs~distance+NoOfPools+meanmin+meanmax+I(meanmin^2)+I(meanmax^2),data=frogs,family=binomial)

> model2b<-glm(pres.abs~distance+NoOfPools+meanmin+meanmax+I(meanmin^2), data=frogs,family=binomial)

> summary(model2b)

> model2b<-glm(pres.abs~distance+NoOfPools+meanmin+meanmax+I(meanmax^2), data=frogs,family=binomial)

> summary(model2b)

 

#Examine other variables

> rcspline.plot(y=frogs$pres.abs,x=frogs$NoOfPools,nk=5,m=20)

> rcspline.plot(y=frogs$pres.abs,x=log(frogs$NoOfPools),nk=5,m=20)

 

> model2b<-glm(pres.abs~distance+log(NoOfPools)+meanmin+meanmax, data=frogs,family=binomial)

 

 

> rcspline.plot(y=frogs$pres.abs,x=frogs$distance,nk=4,m=20)

> rcspline.plot(y=frogs$pres.abs,x=log(frogs$distance),nk=4,m=20)

 

> model3<-glm(pres.abs~log(distance)+NoOfPools+meanmin+meanmax, data=frogs, family=binomial)

> summary(model3)

summary(model2)

 

> model3b<-glm(pres.abs~log(distance)+NoOfPools+meanmin+meanmax+I(meanmax^2), data=frogs,family=binomial)

> model3b

 

Stepwise Selection

 

Read Harrell p. 56.

 

stepAIC function in MASS library is better than most stepwise routines.

 

1. It uses AIC although if Burnham and Anderson were dead they’d turn over in their graves to hear me say that.

2. It obeys the principle of marginality, something virtually no other stewpwise program does.

            a. We say x^2 is marginal to x and x is marginal to 1. If x^2 is in the model, we do not drop x or 1. We drop the marginal terms first.

            b. Same holds for interactions. x1x2 is marginal to x1 and x2. We would not drop x1 while x1x2 is there.

            c. factor variables are dropped en masse, we don’t drop individual dummy components.

stepAIC satisfies these conditions.

 

Totally mindless

 

library(MASS)

 

stepAIC(model3, scope=list(upper=~log(distance)*NoOfPools*meanmin*meanmax, lower=~1))

 

> model4<-glm(formula = pres.abs ~ log(distance) + NoOfPools + meanmin + meanmax + log(distance):meanmin + meanmin:meanmax, family = binomial,      data = frogs)

> summary(model4)

> coef(model3)

  (Intercept) log(distance)     NoOfPools       meanmin       meanmax

  19.64372120   -0.80747125    0.02728446    5.50919219   -2.40129292

> range(frogs$meanmin)

[1] 2.033333 4.333333

> range(log(frogs$distance))

[1] 5.521461 9.798127

 

Model Calibration

 

library(ROCR)

 

Sensitivity and Specificity versus c

 

> pred<-prediction(fitted(model3),frogs$pres.abs)

 

performance(pred,'tpr','tnr')->testy

plot(testy@alpha.values[[1]],testy@x.values[[1]],type='n')

lines(testy@alpha.values[[1]],testy@y.values[[1]])

lines(testy@alpha.values[[1]],testy@x.values[[1]])

 

ROC curve

 

> perf <- performance(pred,"tpr","fpr")

> plot(perf,col="grey82",lty=3)

 

Area under the curve

 

> lrm(pres.abs~log(distance)+NoOfPools+meanmin+meanmax,data=frogs)

 

 

Cross-validation

 

cv.binary(model3)

 

Rewrite the function to make cool plot

 

my.cvfunc<-function (obj = frogs.glm, rand = NULL, nfolds = 10, print.details = TRUE,seed=NULL)

{

    data <- obj$data

    m <- dim(data)[1]

    if (is.null(seed))

    {

    if (is.null(rand))

        rand <- sample(nfolds, m, replace = TRUE)

    }

    else {

        set.seed(seed)

        if (is.null(rand))

        rand <- sample(nfolds, m, replace = TRUE)

    }

    

    form <- formula(obj)

    yvar <- all.vars(form)[1]

    obs <- data[, yvar]

    ival <- unique(rand)

    fam <- obj$family$family

    hat <- predict(glm(form, data, family = fam), type = "response")

    cvhat <- rep(0, length(rand))

    if (print.details)

        cat("\nFold: ")

    my.out<-vector("list",10)

    my.y<-vector("list",10)

    for (i in ival) {

        if (print.details)

            cat("", i)

        if (i%%20 == 0)

            cat("\n")

        here <- i != rand

        i.glm <- glm(form, data = data[here, ], family = fam)

        cvhat[!here] <- predict(i.glm, newdata = data[!here,

            ], family = fam, type = "response")

        my.out[[i]]<-cvhat[!here]

        my.y[[i]]<-data[!here,yvar]     

    }

list(my.out,my.y)

}

 

testit<-my.cvfunc(model3,seed=10)

pred <- prediction(testit[[1]],testit[[2]])

perf <- performance(pred,"tpr","fpr")

plot(perf,col="grey82",lty=3)

plot(perf,lwd=1,avg="vertical",spread.estimate="boxplot",add=TRUE)

abline(0,1,col=2,lty=3)

 

#Compare with model 4

 

testit<-my.cvfunc(model4,seed=10)

pred <- prediction(testit[[1]],testit[[2]])

perf <- performance(pred,"tpr","fpr")

plot(perf,col="grey82",lty=3)

plot(perf,lwd=1,avg="vertical",spread.estimate="boxplot",add=TRUE)

abline(0,1,col=2,lty=3)

 

par(mfrow=c(1,2))

 

 

 

More cross-validation

 

library(boot)

> cost <- function(r, pi=0) mean(abs(r-pi)>0.5)

> cv.glm(frogs,model3,cost,K=10)

 

library(Epi)

> ROC( form = pres.abs ~ log(distance) + NoOfPools + meanmin +      meanmax, plot="ROC" )