# Read in the data. # # The data are the deaths for males in the civilian population of England & # Wales in the calendar year 2002. The population estimates are the central # exposed to risk at 1st July 2002. data<-read.csv(file="c:\\la\\blackhall\\website\\lectures\\EW_males_2002.csv", header=TRUE) # The object data contains three columns: one for age, one for death counts # and one for population. Here we take a look at the data simply by typing # the name of the object and hitting return: data # Since the population estimates are central exposed to risk, we are # modelling the force of mortality, mu, with Poisson death counts. # We can add to our data object by calculating crude values of mu: data$CrudeMu<-data$Deaths/data$Population # Take a look again data # It is always a good idea to plot the data plot(x=data$Age, y=data$CrudeMu, pch=20) # The values of Mu look exponential, so we should try a transformation. # The canonical link for the Poisson model is the natural logarithm. plot(x=data$Age, y=log(data$CrudeMu), pch=20) # This looks passably linear, so fit a generalised linear model. model<-glm(data$Deaths ~ data$Age, offset=log(data$Population), family=poisson()) # Take a look at the fitted model against the data on a log scale # You can access the fitted death counts with model$fitted plot(x=data$Age, y=log(data$CrudeMu), pch=20) lines(x=data$Age, y=log(model$fitted/data$Population)) # Calculate Pearson residuals... pearson<-(data$Deaths-model$fitted)/sqrt(model$fitted) # ...and the Chi-squared test statistic X<-sum(pearson*pearson) X # How many degrees of freedom should we use to check this? # There are n=40 observations and p=2 parameters, so df=38 # Remember that we have estimated two parameters: one being the # mean and one being the coefficient of age. # The standard tables give the upper 5% point of the Chi-squared # distribution on 38 degrees of freedom as 73.55. Alternatively, # you can get R to calculate this: qchisq(0.05, 38, lower.tail=FALSE) # Alternatively, you can get R to calculate a p-level for the # test statistic directly: pchisq(X, 38, lower.tail=FALSE) # which is clearly a lot lower than 5%, or even 0.1%. suggesting # that this graduation is not good enough. We can apply our # tests to see why, but the fastest way is to plot the residuals plot(x=data$Age, y=pearson, pch=20) # The most obvious problem is the outlier at -8. The residuals # should be approximately N(0,1)-distributed and -8 is very extreme. # We can take a look at the contribution towards the Chi-squared # statistic by age as follows: cbind(data$Age,pearson*pearson) # The main problem lies at age 83, but with other problem ages at 76 and 50 # Cumulative deviations test for Poisson model CDT<- sum(data$Deaths-model$fitted) CDT.var<- sum(model$fitted) Z<-CDT / sqrt(CDT.var) Z # The CDT statistic is practically zero. Why? # Lag 1 serial correlation test r<-cor(pearson[1:39], pearson[2:40]) rvariance<-1/(40-1) Z<-r/sqrt(rvariance) Z # Z should be approximately N(0,1), and this exceeds the upper 5% point # of 1.64. Alternatively, you can get R to calculate a p-level: pnorm(Z, lower.tail=FALSE) # This is less than 1%, so the serial correlation test is significant at # this level, i.e. the residuals are correlated and not random. # We can also conduct a formal test for bias. Before we do, we can just # use our eyes by plotting a histogram of the residuals: hist(pearson) # We can make life easier for ourselves by using the stem command to split # the residuals up by range: stem(pearson, scale=2) # We have 18 positive residuals out of 40, so we don't need to conduct # the sign test for bias: 18 positive residuals against 20 expected is # nothing out of the ordinary. # # We can calculate the standardised deviations test statistic as follows: lowerbreakpoint<-c(-9, -3, -2, -1, 0, 1, 2, 3) upperbreakpoint<-c(-3, -2, -1, 0, 1, 2, 3, 9) expectedpearson<-40*(pnorm(upperbreakpoint) - pnorm(lowerbreakpoint)) actual<-c(1, 3, 6, 12, 8, 3, 3, 4) SDT<-sum((actual-expectedpearson)^2/expectedpearson) SDT # We compare this against the Chi-squared distribution. # How many degrees of freedom? # 8 ranges # less 1 constraint # i.e. 7 df qchisq(0.05, 7, lower.tail=FALSE) # Alternatively, we can get R to calculate a p-level for the # test statistic directly: pchisq(SDT, 7, lower.tail=FALSE)