Cox–Snell Residuals

Using the bmt data set, we compute Cox–Snell residuals and the Nelson–Aalen cumulative hazard function of the residuals. If the model is correct, the cumulative hazard function of these residuals is a line with slope 1.

data(bmt)

## code taken from: https://www.math.wustl.edu/~jmding/math434/R_model_diag.R

## recode variables according to book
bmt$z1 <- bmt$z1 - 28
bmt$z2 <- bmt$z2 - 28
bmt$z1xz2 <- bmt$z1 * bmt$z2
bmt$g1 <- as.double(bmt$group==1)
bmt$g2 <- as.double(bmt$group==2)
bmt$g3 <- as.double(bmt$group==3)
bmt$z7c = bmt$z7 / 30 - 9

## fit model and coxsnell residuals
fit1 <- coxph(Surv(t2,d3)~z1+z2+z1*z2+g2+g3+z7c+z8+z10,
              method='breslow',data=bmt)
coxsnellres <- bmt$d3-resid(fit1,type="martingale")

## Use NA method to estimate the cumulative hazard function for residuals
fit4 <- survfit(Surv(coxsnellres,bmt$d3)~1)
nacumhaz <- cumsum(fit4$n.event/fit4$n.risk)

## plot the results
par(mar=c(5,5,1,1))
plot(fit4$time,nacumhaz,type='s',col='blue')
abline(0,1,col='red',lty=2)

Application to SLC7A11 in kirc_small

Continuous variables are often cut on some value (such as the median) and analyzed as binary factors. Consider Kaplan-Meier survival curves for the SLC7A11_ex thresholded on the median from the data set kirc_small.RData. These are plotted below along with the result of the log rank test which is not significant:

load("kirc_small.RData")
names(kirc_small)
## [1] "time"          "death_obs"     "BAP1_mutation" "SLC7A11_ex"
kirc_small[,"SLC7A11_gr_median"] <- kirc_small$SLC7A11_ex > median(kirc_small$SLC7A11_ex)
fit <- survfit(Surv(time,death_obs)~SLC7A11_gr_median,data=kirc_small)
plot(fit,xlab="Days Since Diagnosis",ylab="S(t)",col=1:2,lty=1:2,lwd=2)
legend("bottomleft",names(fit$strata),col=1:2,lty=1:2,lwd=2)

## log-rank p-value
survdiff(Surv(time,death_obs)~SLC7A11_gr_median,
              data=kirc_small)
## Call:
## survdiff(formula = Surv(time, death_obs) ~ SLC7A11_gr_median, 
##     data = kirc_small)
## 
##                           N Observed Expected (O-E)^2/E (O-E)^2/V
## SLC7A11_gr_median=FALSE 189       40     49.4      1.77      3.55
## SLC7A11_gr_median=TRUE  189       59     49.6      1.76      3.55
## 
##  Chisq= 3.5  on 1 degrees of freedom, p= 0.06

A very similar analysis was done in this article.

We often have much more power treating continuous variables as continuous and using Cox PH or other models. With the Cox Proportional hazards model, the p-value is far more significant (< 0.001).

fit <- coxph(Surv(time,death_obs)~log2(SLC7A11_ex),
               data=kirc_small)
fit
## Call:
## coxph(formula = Surv(time, death_obs) ~ log2(SLC7A11_ex), data = kirc_small)
## 
##                     coef exp(coef) se(coef)    z     p
## log2(SLC7A11_ex) 0.25485   1.29027  0.07746 3.29 0.001
## 
## Likelihood ratio test=10.88  on 1 df, p=0.0009744
## n= 378, number of events= 99

Measuring Deviation in Graphical Model Fitting

We can measure the goodness-of-fit of this model using Cox-Snell residuals. Here we also simulate deviates from the expo(1) model and plot the resulting cumulative hazards as a graphical method for determining how far the observed residuals are from the true model.

fit <- coxph(Surv(time,death_obs)~log2(SLC7A11_ex),
               data=kirc_small)
coxsnellres <- kirc_small$death_obs-resid(fit,type="martingale")

## Use NA method to estimate the cumulative hazard function for residuals
fit_na <- survfit(Surv(coxsnellres,kirc_small$death_obs)~1)
nacumhaz <- cumsum(fit_na$n.event/fit_na$n.risk)


## Compute NA for deviates which actually come from expo(1) model
n <- nrow(kirc_small)
N <- 100
ti <- list()
nacumhaz0 <- list()

for(ii in 1:N){
  x <- rexp(n)
  fit_na0 <- survfit(Surv(x,rep(1,n))~1)
  ti[[ii]] <- fit_na0$time
  nacumhaz0[[ii]] <- cumsum(fit_na0$n.event/fit_na0$n.risk)
}


## plot the results
par(mar=c(5,5,1,1))
xlim <- range(fit_na$time)
ylim <- range(nacumhaz)
plot(0,0,xlim=xlim,ylim=ylim,col='0',
     xlab="Time",ylab="Cumulative Hazard")
for(ii in 1:N){
  points(ti[[ii]],nacumhaz0[[ii]],type='s',col='grey')
}
points(fit_na$time,nacumhaz,type='s',col='blue')
abline(0,1,col='red',lty=2)