Logistic Regression

Plot of \(\pi(x)\) functions
Exploratory of Crab Data
Logistic regression
GLM using probit link
Grouped and ungrouped data
Links:

Plot of \(\pi(x)\) functions

x = runif(10000, -50,50)


par(mfrow=c(2,2))
plot(x,exp(20+2*x)/(1+exp(20+2*x)),  
     main = substitute(paste(alpha, "=20 and ", beta, " = 2",sep="")), 
     xlab = "x", ylab=expression(pi(x)))
plot(x,exp(2+2*x)/(1+exp(2+2*x)),  
     main = substitute(paste(alpha, "=2 and ", beta, " = 2",sep="")), 
     xlab = "x", ylab=expression(pi(x)))
plot(x,exp(2+0.2*x)/(1+exp(2+0.2*x)),  
     main = substitute(paste(alpha, "=2 and ", beta, " = 0.2",sep="")), 
     xlab = "x", ylab=expression(pi(x)))
plot(x,exp(2+20*x)/(1+exp(2+20*x)),  
     main = substitute(paste(alpha, "=2 and ", beta, " = 20",sep="")), 
     xlab = "x", ylab=expression(pi(x)))

par(mfrow=c(2,2))
plot(x,exp(2+2*x)/(1+exp(2+2*x)),  
     main = substitute(paste(alpha, "=2 and ", beta, " = 2",sep="")), 
     xlab = "x", ylab=expression(pi(x)))
plot(x,exp(2-2*x)/(1+exp(2-2*x)),  
     main = substitute(paste(alpha, "=2 and ", beta, " = -2",sep="")), 
     xlab = "x", ylab=expression(pi(x)))
plot(x,exp(2-0.2*x)/(1+exp(2-0.2*x)),  
     main = substitute(paste(alpha, "=2 and ", beta, " = -0.2",sep="")), 
     xlab = "x", ylab=expression(pi(x)))
plot(x,exp(2+0*x)/(1+exp(2+0*x)),  
     main = substitute(paste(alpha, "=2 and ", beta, " = 0",sep="")), 
     xlab = "x", ylab=expression(pi(x)))

[Data] Crab Data: comes from a study of nesting horseshoe crabs (J. Brockmann, Ethology, 102: 1-21, 1996). Each female horseshoe crab in the study had a male crab attached to her in her nest. The study investigated factors that affect whether the female crab had any other males, called satellites, residing nearby her.

The response outcome for each female crab is whether or not having satellites. The value is 0 if there is no any satellites and 1 if at least one satellites.
An explanatory variable thought possibly to affect this was the female crab’s shell width, which is a summary of her size.

[Goal] Associate the probability of having at least one satellite with the width of the female crab.

Exploratory of Crab Data

crab = read.csv("../data/crab.csv",header=TRUE)
colnames(crab)

## [1] "Obs"        "Color"      "Spine"      "Width"      "Weight"    
## [6] "Satellites"

flag = as.numeric(crab[,"Satellites"]>0)
crab = cbind(crab,flag)
head(crab)

##   Obs Color Spine Width Weight Satellites flag
## 1   1     2     3  28.3   3.05          8    1
## 2   2     3     3  26.0   2.60          4    1
## 3   3     3     3  25.6   2.15          0    0
## 4   4     4     2  21.0   1.85          0    0
## 5   5     2     3  29.0   3.00          1    1
## 6   6     1     2  25.0   2.30          3    1

plot(crab[,"Width"], crab[,"flag"], xlab = "Width", ylab="Flag if having at least one satellite")

upper.endpoint = c(23.25, 24.25,25.25,26.25,27.25,28.25,29.25,(max(crab$Width)+0.1))
width.interval =cut(crab$Width, c(min(crab$Width)-0.1,upper.endpoint))
crab = cbind(crab,width.interval)

library(plyr) 
prop.flag.byinterval  = ddply(crab,~width.interval,summarise,mean=mean(flag))$mean
mean.width.byinterval = ddply(crab,~width.interval,summarise,mean=mean(Width))$mean

plot(crab[,"Width"], crab[,"flag"], xlab = "Width", ylab="Flag if having at least one satellite")
points(mean.width.byinterval,prop.flag.byinterval,pch=19,col="blue")
lines(lowess(mean.width.byinterval,prop.flag.byinterval))

Logistic regression

# The default is log link.
fit.logit = glm(flag ~ Width, family=binomial, data=crab)
summary(fit.logit)

## 
## Call:
## glm(formula = flag ~ Width, family = binomial, data = crab)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0281  -1.0458   0.5480   0.9066   1.6942  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -12.3508     2.6287  -4.698 2.62e-06 ***
## Width         0.4972     0.1017   4.887 1.02e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 194.45  on 171  degrees of freedom
## AIC: 198.45
## 
## Number of Fisher Scoring iterations: 4

GLM using probit link

fit.probit = glm(flag ~ Width, family=binomial(link = "probit"), data=crab)
summary(fit.probit)

## 
## Call:
## glm(formula = flag ~ Width, family = binomial(link = "probit"), 
##     data = crab)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0519  -1.0494   0.5374   0.9126   1.6897  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.50196    1.50712  -4.978 6.44e-07 ***
## Width        0.30202    0.05804   5.204 1.95e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 194.04  on 171  degrees of freedom
## AIC: 198.04
## 
## Number of Fisher Scoring iterations: 5

Grouped and ungrouped data

## import data 

ungrouped = read.csv(file="../data/KstoneMarginalUngroup.csv",header=TRUE)

head(ungrouped)

##   Trt Response
## 1   A  Success
## 2   A  Success
## 3   A  Success
## 4   A  Success
## 5   A  Success
## 6   A  Success

table(ungrouped)

##    Response
## Trt Failure Success
##   A      77     273
##   B      61     289

grouped = data.frame(Trt = c("A","B"),
                     Fail = c(77,61),
                     Succ = c(273, 289))
grouped

##   Trt Fail Succ
## 1   A   77  273
## 2   B   61  289

## build models

ungroup.M0 = glm(Response~1, family=binomial,data=ungrouped)
ungroup.M1 = glm(Response~1+Trt, family=binomial,data=ungrouped)

group.M0   = glm(cbind(Succ,Fail)~1, family=binomial,data=grouped)
group.M1   = glm(cbind(Succ,Fail)~Trt, family=binomial,data=grouped)

## model summarys

ungroup.M0.summary = summary(ungroup.M0)
ungroup.M1.summary = summary(ungroup.M1)

group.M0.summary   = summary(group.M0)
group.M1.summary   = summary(group.M1)

### check what summary contains.
names(ungroup.M1.summary)

##  [1] "call"           "terms"          "family"         "deviance"      
##  [5] "aic"            "contrasts"      "df.residual"    "null.deviance" 
##  [9] "df.null"        "iter"           "deviance.resid" "coefficients"  
## [13] "aliased"        "dispersion"     "df"             "cov.unscaled"  
## [17] "cov.scaled"

The summary does not contain log-likelihood information, but contains deviance and null deviance.

The deviance evaluates the current model; and
The null deviance evaluates the null model (intercept only model)

To get log-likelihood, we can apply logLik() function on the model. The output from logLik() is the Full Log Likelihood of the SAS output. For ungrouped data, the Full Log Likelihood and Log Likelihood functions are the same. For grouped data, the two differ by a constant. The reason is that likelihood function for the ungrouped data is built on Bernulli distribution, and the likelihood function for the ungrouped data is built on Binomial distribution. The two differ by a multiplier of Bionomial distribution. However, both share the same portion that relates to the unknown parameters. Specifically, \[ \text{Full Log Likelihood} = \text{Log Likelihood} + \log \left( {n_1 \choose y_1} \times {n_2 \choose y_2} \times \ldots \times {n_G \choose y_G}\right), \] where \(G\) is the number of groups. In our case, the constant is \[ \log \left( {(273+77) \choose 273} \times {(289+61) \choose 289} \right) = 340.486. \] To verify log-likelihood function is the same for grouped and ungrouped data, it means that log-likelihood not the full log-likelihood. Therefore, we need to adjust for that constant from R output before we make comparions.

constant.to.adjust = log(choose(n=(273+77),k=273)*choose(n=(289+61),k=289))
  
cat(paste("For ungrouped data:", "\n",
          "[1] Log-likelihood",  "\n",
               "    M0=", round(logLik(ungroup.M0),4), "\n",
               "    M1=", round(logLik(ungroup.M1),4), "\n",
          "[2] Deviance \n", 
               "    M0=", round(ungroup.M0.summary$"deviance",4), "\n",
               "    M1=", round(ungroup.M1.summary$"deviance",4), "\n" ))

## For ungrouped data: 
##  [1] Log-likelihood 
##      M0= -347.4912 
##      M1= -346.3338 
##  [2] Deviance 
##      M0= 694.9824 
##      M1= 692.6675

cat(paste("For grouped data:", "\n",
          "[1] Log-likelihood",  "\n",
               "    M0=", round(logLik(group.M0)-constant.to.adjust,4), "\n",
               "    M1=", round(logLik(group.M1)-constant.to.adjust,4), "\n",
          "[2] Deviance \n", 
               "    M0=", round(group.M0.summary$"deviance",4), "\n",
               "    M1=", round(group.M1.summary$"deviance",4), "\n" ))

## For grouped data: 
##  [1] Log-likelihood 
##      M0= -347.4912 
##      M1= -346.3338 
##  [2] Deviance 
##      M0= 2.3148 
##      M1= 0