Lab 9---Logistic Regression
#grouped binary data
> library(faraway)
> data(bliss)
> bliss
dead alive conc
1 2 28
0
2 8 22
1
3 15 15 2
4 23 7
3
5 27 3
4
#fit model
> out1<-glm(cbind(dead,alive)~conc,data=bliss,family=binomial)
> summary(out1)
#analysis of deviance--need to specify the test
> anova(out1)
> anova(out1,type='Chi')
#interpret the coefficient
> coef(out1)
(Intercept)
conc
-2.323790 1.161895
> exp(coef(out1)[2])
conc
3.195984
#odds of death increased by
a factor of 3.2 by each one unit increase in conc.
#graph results
#empirical logit
logit.p<-log((bliss$dead+1/2)/(bliss$alive+1/2))
plot(bliss$conc,logit.p,xlab='concentration',ylab='logit(p)')
abline(coef(out1),col=2)

#on probability scale
plot(bliss$conc,bliss$dead/(bliss$dead+bliss$alive),xlab='Concentration',
ylab='probability')
lines(seq(0,4,.1),1/(1+exp(-coef(out1)[1]-coef(out1)[2]*seq(0,4,.1))),col=2)

#check for lack of fit
> bliss
dead alive conc
1 2 28 0
2 8 22
1
3 15 15 2
4 23 7
3
5 27 3
4
#obtain group sample sizes
> ni<-apply(bliss[,1:2],1,sum)
> ni
1
2 3 4 5
30 30 30
30 30
#calculate predicted probabilities
> 1/(1+exp(-coef(out1)[1]-coef(out1)[2]*0:4))
[1] 0.08917177 0.23832314 0.50000000 0.76167686
0.91082823
> 1/(1+exp(-coef(out1)[1]-coef(out1)[2]*0:4))->phat
#calculate expected values
> cbind(ni*phat,ni-ni*phat)->Ei
> Ei
[,1] [,2]
1 2.675153 27.324847
2 7.149694 22.850306
3 15.000000 15.000000
4 22.850306 7.149694
5 27.324847 2.675153
#obtain Oi
and calculate Pearson
> Oi<-bliss[,1:2]
> pearson<-sum((Oi-Ei)^2/Ei)
> pearson
[1] 0.3672674
> 1-pchisq(pearson,dim(bliss)[1]-2)
[1] 0.9469181
#compare to sum of pearson residuals squared
> residuals(out1,type='pearson')
1 2 3 4 5
-0.43252342 0.36437292 0.00000000
0.06414687 -0.20810684
> sum(residuals(out1,type='pearson')^2)
[1] 0.3672674
#try G2 test
> sum(2*Oi*log(Oi/Ei))
[1] 0.3787483
#look at sum of deviance residuals squared
> sum(residuals(out1,type='deviance')^2)
[1] 0.3787483
#look at output--the residual deviance is the G2
goodness of fit test.
> summary(out1)
Ungrouped
Binary Data--Habitat suitability modeling
http://www.kidcyber.com.au/topics/frog_corrob.htm
http://www.arkive.org/species/GES/amphibians/Pseudophryne_corroboree/GES005645.html?size=large
http://www.kidcyber.com.au/topics/frog_corrob.htm
http://www.abc.net.au/science/scribblygum/june2004/
Hunter,
D. (2000) The conservation and demography of the southern
corroboree frog (Pseudophryne
corroboree). M.Sc. thesis,
library(DAAG)
data(frogs)
names(frogs)
> help(frogs)
#examine data
#data have a spatial component
> plot(frogs$easting,frogs$northing,pch=1,col=2*(frogs$pres.abs+1))

#let's look at one predictor meanmin
> plot(frogs$meanmin,frogs$pres.abs)
#perhaps quadratic?
> lines(lowess(frogs$pres.abs~frogs$meanmin),col=1,lty=3)
> lines(lowess(frogs$pres.abs~frogs$meanmin,f=.9),col=2)

#fit model
> glm(pres.abs~meanmin,data=frogs,family=binomial)->out0
#plot result
> plot(frogs$meanmin,frogs$pres.abs)
> lines(lowess(frogs$pres.abs~frogs$meanmin),col=1,lty=3)
#add model
> phat<-function(x)
1/(1+exp(-coef(out0)[1]-coef(out0)[2]*x))
> lines(seq(2,4.5,.01),phat(seq(2,4.5,.01)),col=2)
#add a quadratic
> glm(pres.abs~meanmin+I(meanmin^2),data=frogs,family=binomial)->out0b
> phat2<-function(x) 1/(1+exp(-coef(out0b)[1]-coef(out0b)[2]*x-coef(out0b)[3]*x^2))
#plot results
> plot(frogs$meanmin,frogs$pres.abs)
> lines(seq(2,4.5,.01),phat(seq(2,4.5,.01)),col=2)
> lines(seq(2,4.5,.01),phat2(seq(2,4.5,.01)),col=4)
> lines(lowess(frogs$pres.abs~frogs$meanmin),col=1,lty=3)

#goodness of fit--grouping
x-values
#not many repeated values
>table(frogs$meanmin)
> sum(table(frogs$meanmin))
[1] 212
#perhaps ten categories?
> quantile(frogs$meanmin,seq(0,1,.2))
0% 20%
40% 60% 80%
100%
2.033333 2.500000 2.866667 3.253333 3.633333
4.333333
#lets just approximate this
> cut(frogs$meanmin,seq(2,4.5,.5))->test.cuts
> table(test.cuts)
test.cuts
(2,2.5] (2.5,3] (3,3.5]
(3.5,4] (4,4.5]
48 59
49 24 32
> table(frogs$pres.abs,test.cuts)
test.cuts
(2,2.5] (2.5,3] (3,3.5] (3.5,4] (4,4.5]
0 42
44 23 8
16
1
6 15 26
16 16
#now use quantiles
> cut(frogs$meanmin,quantile(frogs$meanmin,seq(0,1,.2)),include.lowest=TRUE)->test.cuts2
> sum(table(test.cuts2))
[1] 212
> table(frogs$pres.abs,test.cuts2)
test.cuts2
[2.03,2.5] (2.5,2.87] (2.87,3.25] (3.25,3.63] (3.63,4.33]
0 42 32 23 14 22
1 6 7 17 29 20
#calculate n*pi
> tapply(fitted(out0b),test.cuts2,sum)
[2.03,2.5] (2.5,2.87]
(2.87,3.25] (3.25,3.63] (3.63,4.33]
4.117692 10.161348 17.619857
24.697414 22.403690
#obtain cell counts, the ni
>apply(table(frogs$pres.abs,test.cuts2),2,sum)
[2.03,2.5] (2.5,2.87]
(2.87,3.25] (3.25,3.63] (3.63,4.33]
48 39 40
43 42
> apply(table(frogs$pres.abs,test.cuts2),2,sum)-np->fail
> table(frogs$pres.abs,test.cuts2)->Oi
> rbind(fail,np)->Ei
> Ei
[2.03,2.5] (2.5,2.87] (2.87,3.25] (3.25,3.63] (3.63,4.33]
fail 43.882308
28.83865 22.38014 18.30259
19.59631
np 4.117692
10.16135 17.61986 24.69741
22.40369
> Oi
test.cuts2
[2.03,2.5]
(2.5,2.87] (2.87,3.25] (3.25,3.63] (3.63,4.33]
0
42 32 23 14 22
1
6 7 17 29 20
> sum((Oi-Ei)^2/Ei)
[1] 4.624012
> 1-pchisq(sum((Oi-Ei)^2/Ei),df=5-3)
[1] 0.09906236
#was the quadratic term necessary?
#I go back and change the model
> tapply(fitted(out0),test.cuts2,sum)->np2
> apply(table(frogs$pres.abs,test.cuts2),2,sum)-np2->fail2
> rbind(fail2,np2)->Ei2
> Ei2
[2.03,2.5] (2.5,2.87] (2.87,3.25] (3.25,3.63] (3.63,4.33]
fail2 39.278011 28.83784
26.30426 23.55765 15.02225
np2
8.721989 10.16216 13.69574
19.44235 26.97775
> sum((Oi-Ei2)^2/Ei2)
[1] 17.20310
> 1-pchisq(sum((Oi-Ei2)^2/Ei2),df=3)
[1] 0.0006419176
#One problem: with the more
than one predictor, this method becomes hopeless
#Hosmer-Lemeshow
Test
#let cut decide
> p.groups<-cut(fitted(out0b),breaks=10)
> table(p.groups)
p.groups
(0.018,0.0769] (0.0769,0.136] (0.136,0.195]
(0.195,0.254] (0.254,0.313] (0.313,0.372] (0.372,0.43]
19 29 8 6 13 14 18
(0.43,0.489] (0.489,0.548] (0.548,0.607]
17 45 43
#pick your own
> range(fitted(out0b))
[1] 0.01860714 0.60661375
> p.groups2<-cut(fitted(out0b),seq(0,0.66,.066))
> table(p.groups2)
p.groups2
(0,0.066]
(0.066,0.132] (0.132,0.198] (0.198,0.264]
(0.264,0.33] (0.33,0.396]
(0.396,0.462]
18 24 14 10 13 22 11
(0.462,0.528]
(0.528,0.594] (0.594,0.66]
33 45 22
#use quantiles
> p.groups3<-cut(fitted(out0b),quantile(fitted(out0b),seq(0,1,.1)),include.lowest=TRUE)
> table(p.groups3)
p.groups3
[0.0186,0.0956] (0.0956,0.136] (0.136,0.259] (0.259,0.338] (0.338,0.414] (0.414,0.498]
29 19 18 21 20 27
(0.498,0.533]
(0.533,0.552] (0.552,0.595] (0.595,0.607]
17 18 21 22
> table(p.groups3)
p.groups3
[0.0186,0.0956] (0.0956,0.136] (0.136,0.259] (0.259,0.338] (0.338,0.414] (0.414,0.498]
29 19 18 21 20 27
(0.498,0.533]
(0.533,0.552] (0.552,0.595] (0.595,0.607]
17 18 21 22
#obtain observed counts in logit
deciles
> table(frogs$pres.abs,p.groups3)
p.groups3
[0.0186,0.0956] (0.0956,0.136] (0.136,0.259] (0.259,0.338]
(0.338,0.414] (0.414,0.498] (0.498,0.533]
0 27 15
15
17 12 17 10
1 2 4 3 4 8 10 7
p.groups3
(0.533,0.552] (0.552,0.595] (0.595,0.607]
0 7 6 7
1 11 15 15
> table(frogs$pres.abs,p.groups3)->Oi
#obtain expected successes in each category
> tapply(fitted(out0b),p.groups3,sum)
[0.0186,0.0956] (0.0956,0.136] (0.136,0.259] (0.259,0.338] (0.338,0.414] (0.414,0.498]
1.794265 2.323426 3.621496 6.539852 7.903507 12.886003
(0.498,0.533]
(0.533,0.552] (0.552,0.595] (0.595,0.607]
8.829664 9.750304 12.098450 13.253032
#obtain total counts in each category
> apply(Oi,2,sum)->ni
> ni
[0.0186,0.0956] (0.0956,0.136] (0.136,0.259] (0.259,0.338] (0.338,0.414] (0.414,0.498]
29
19 18 21 20 27
(0.498,0.533]
(0.533,0.552] (0.552,0.595] (0.595,0.607]
17 18 21 22
#obtain expected failures and successes
> rbind(ni-tapply(fitted(out0b),p.groups3,sum),tapply(fitted(out0b),p.groups3,sum))->Ei
#carry out H-L Test
> sum((Oi-Ei)^2/Ei)
[1] 7.567172
#df
is g-2 always, where g is the number of deciles
> 1-pchisq(sum((Oi-Ei)^2/Ei),df=8)
[1] 0.4768487
"The H-L test is largely obsolete, having been
replaced by specific directed
tests of lack of fit and by the
new test described in"
#an alternative test
> library(Design)
> meanmin2<-frogs$meanmin^2
> out.harrell<-lrm(pres.abs~meanmin+meanmin2,data=frogs,x=TRUE,y=TRUE)
> residuals.lrm(out.harrell,type='gof')
Sum of squared errors Expected value|H0 SD Z P
41.2834597
41.6627336
0.3486647 -1.0877900 0.2766878
>