Chapter 7 Splines
See final subsection on comparing splines for best way to plot this kind of data.
7.1 Step Functions
With Boston
data, following on from polynomial regression section.
To use a step function model, use cut(x, n)
function which automatically assigns break points. Note that n
specifies the number of intervals, not the number of break points (cuts).
= lm(y ~ cut(x, 3)) step3
Then following generic analysis for plotting:
= predict(step3, newdata = list(x = sort(x)), se = TRUE)
pred3
= cbind(pred3$fit + 2*pred3$se.fit, pred3$fit-2*pred3$se.fit)
se.bands3
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "2 cutpoints", bty = 'l')
lines(sort(x), pred3$fit, lwd = 2, col = "darkviolet")
matlines(sort(x), se.bands3, lwd = 1.4, col = "darkviolet", lty = 3)
We plot below as if we did it for 2,3,4 and 5 intervals:
7.2 Regression Splines
Using Boston
data again with x
and y
defined as variables for convenience with labels y.lab
and x.lab
done as well.
For regression splines, we use the splines
package.
library(splines)
Cuts for regression splines can be specified or done automatically. One option is to specify them using quartiles: cuts = summary(x)[c(2, 3, 5)]
.
Since we use data points \(x\) values as points to plot the splines, we must also sort them: sort.x = sort(x)
.
Regression Spline Model and SE Bands
We use bs()
function from splines
and can either specify the knots or specify the degrees of freedom:
= lm(y ~ bs(x, degree = 1, knots = cuts))
spline1 = predict(spline1, newdata = list(x = sort.x), se = TRUE)
pred1 = cbind(pred1$fit + 2 * pred1$se.fit, pred1$fit - 2 * pred1$se.fit) se.bands1
= lm(y ~ bs(x, degree = 1, df = 5))
spline1df = predict(spline1df, newdata = list(x = sort.x), se = TRUE)
pred1df = cbind( pred1df$fit + 2 * pred1df$se.fit,
se.bands1df $fit - 2 * pred1df$se.fit ) pred1df
Plot
par(mfrow = c(1, 2))
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Linear Spline (with knots)", bty = 'l')
lines(sort.x, pred1$fit, lwd = 2, col = "red")
matlines(sort.x, se.bands1, lwd = 2, col = "red", lty = 3)
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Linear Spline (with df)", bty = 'l')
lines(sort.x, pred1df$fit, lwd = 2, col = "darkred")
matlines(sort.x, se.bands1df, lwd = 2, col = "red", lty = 3)
For Higher Degrees
= lm(y ~ bs(x, degree = 2, df = 6))
spline2 = predict(spline2, newdata = list(x = sort.x), se = TRUE)
pred2 = cbind(pred2$fit + 2 * pred2$se.fit, pred2$fit - 2 * pred2$se.fit)
se.bands2
= lm(y ~ bs(x, degree = 3, df = 7))
spline3 = predict(spline3, newdata = list(x = sort.x), se = TRUE)
pred3 = cbind(pred3$fit + 2 * pred3$se.fit, pred3$fit - 2 * pred3$se.fit)
se.bands3
par(mfrow = c(1,3))
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Linear Spline", bty = 'l')
lines(sort.x, pred1$fit, lwd = 2, col = "darkred")
matlines(sort.x, se.bands1, lwd = 2, col = "darkred", lty = 3)
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Quadratic Spline", bty = 'l')
lines(sort.x, pred2$fit, lwd = 2, col = "darkgreen")
matlines(sort.x, se.bands2, lwd = 2, col = "darkgreen", lty = 3)
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Cubic Spline", bty = 'l')
lines(sort.x, pred3$fit, lwd = 2, col = "darkblue")
matlines(sort.x, se.bands3, lwd = 2, col = "darkblue", lty = 3)
7.3 Natural Splines
Adds the extra constraints of continuity in first \(d-1\) derivatives. We use the ns()
function. Again, we use library("splines")
package.
= lm(y ~ ns(x, df = 3))
spline.ns3 = predict(spline.ns3, newdata = list(x = sort.x), se = TRUE)
pred.ns3 = cbind(pred.ns3$fit + 2 * pred.ns3$se.fit,
se.bands.ns3 $fit - 2 * pred.ns3$se.fit) pred.ns3
Plotting
Assuming we have made similarly spline.ns1
, spline.ns2
, spline.ns3
and spline.ns4
.
par(mfrow = c(2, 2))
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Natural Spline (1 df)", bty = 'l')
lines(sort.x, pred.ns1$fit, lwd = 2, col = "darkred")
matlines(sort.x, se.bands.ns1, lwd = 2, col = "darkred", lty = 3)
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Natural Spline (2 df)", bty = 'l')
lines(sort.x, pred.ns2$fit, lwd = 2, col = "darkgreen")
matlines(sort.x, se.bands.ns2, lwd = 2, col = "darkgreen", lty = 3)
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Natural Spline (3 df)", bty = 'l')
lines(sort.x, pred.ns3$fit, lwd = 2, col = "darkblue")
matlines(sort.x, se.bands.ns3, lwd = 2, col = "darkblue", lty = 3)
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Natural Spline (4 df)", bty = 'l')
lines(sort.x, pred.ns4$fit, lwd = 2, col = "brown")
matlines(sort.x, se.bands.ns4, lwd = 2, col = "brown", lty = 3)
Practical 4 Example (Using Knots)
= lm(y ~ ns(x, knots = cuts))
ns = predict(ns, newdata = list(x = grid), se = TRUE)
pred.ns = cbind(pred.ns$fit + 2 * pred.ns$se.fit,
se.bands.ns $fit - 2 * pred.ns$se.fit) pred.ns
7.4 Smoothing Splines
For smoothing splines we use smoothing.splines()
instead of lm()
. Use the df
parameter to specify effective degrees of freedom or the cv = TRUE
parameter to use (ordinary leave-one-out) cross validation instead (to choose a \(\lambda\)). Can also directly specify lambda
or spar
(a smoothing parameter).
Notes: Specifying \(\lambda\) with effective degrees of freedom (df = ...
) still uses cross-validation.
= smooth.spline(x, y, df = 3) #choose lambda using effective degrees of freedom
smooth1 = smooth.spline(x, y, cv = TRUE) #choose lambda via cross validation
smooth2
par(mfrow = c(1,2))
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Smoothing Spline (3 df)", bty = 'l')
lines(smooth1, lwd = 2, col = "brown")
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
main = "Smoothing Spline (CV)", bty = 'l')
lines(smooth2, lwd = 2, col = "darkorange")
7.5 Comparing Splines
Below compares different splines on wage data. We load packages library("ISLR")
for data and library("splines")
to use regression and natural splines.
Setup for plotting curves
= range(Wage$age)
agelims = seq(from=agelims[1], to=agelims[2]) age.grid
Taking a sample of data of size \(n\)
set.seed(1)
# Number of data points - change this to investigate small (50),
# medium (200) and large (1000) samples.
<- 200
n
= sample(1:3000, n)
ind = Wage[ind,] # Label subset of data as Wage1. Wage1
Cubic Spline
= lm(wage~bs(age, degree = 3, knots = c(30,40,60)), data = Wage1)
fitbs = predict(fitbs, newdata = list(age = age.grid), se = T) predbs
Natural Spline
= lm(wage~ns(age, knots = c(30,40,60)), data = Wage1)
fitns = predict(fitns, newdata = list(age = age.grid), se = T) predns
## Warning in if (se.fit) list(fit = predictor, se.fit = se, df = df,
## residual.scale = sqrt(res.var)) else predictor: the condition has length > 1 and
## only the first element will be used
Smoothing Spline
= smooth.spline(Wage1$age, Wage1$wage, cv = TRUE) fitss
Generate the Plots
par(mfrow=c(1,3))
# Cubic Spline
plot(Wage1$age, Wage1$wage, col = "darkgray", pch = 19,
main = 'Cubic spline', bty = 'l',
xlab = 'age', ylab = 'wage', cex.lab = 1.4)
lines(age.grid, predbs$fit, lwd = 2, col = 'darkgreen')
abline(v = c(30,40,60), lty = 'dashed')
# Natural Spline
plot(Wage1$age, Wage1$wage, col = "darkgray", pch = 19,
main = 'Natural cubic spline', bty = 'l',
xlab = 'age', ylab = 'wage', cex.lab = 1.4)
lines(age.grid, predns$fit, lwd = 2, col = 'darkviolet')
abline(v = c(30,40,60), lty = 'dashed')
# Smoothing Spline
plot(Wage1$age, Wage1$wage, col = "darkgray", pch = 19,
main = 'Smoothing spline', bty = 'l',
xlab = 'age', ylab = 'wage', cex.lab = 1.4)
lines(fitss, lwd = 2, col = 'brown')
Practical 4 Example
Modelling medv ~ indus
using natural and smoothing splines.
Basic Setup:
library("MASS")
library(splines)
data("Boston")
= Boston$medv
y = Boston$indus
x = "Median House Value ($1000s)"
y.lab = "Proportion of non-retail business acres per town" x.lab
<- summary(x)[c(2,3,5)] #For knots
cuts = seq(min(x), max(x), length.out = 500) #For plotting curves grid
Cubic Spline:
= lm(y ~ bs(x, knots = cuts))
bs = predict(bs, newdata = list(x = grid), se = TRUE)
pred.bs = cbind(pred.bs$fit + 2 * pred.bs$se.fit,
se.bands.bs $fit - 2 * pred.bs$se.fit) pred.bs
Natural Cubic Spline (Knots):
= lm(y ~ ns(x, knots = cuts))
nsk = predict(nsk, newdata = list(x = grid), se = TRUE)
pred.nsk = cbind(pred.nsk$fit + 2 * pred.nsk$se.fit,
se.bands.nsk $fit - 2 * pred.nsk$se.fit) pred.nsk
Natural Cubic Spline (2 df):
= lm(y ~ ns(x, df = 2))
ns2 = predict(ns2, newdata = list(x = grid), se = TRUE)
pred.ns2 = cbind(pred.ns2$fit + 2 * pred.ns2$se.fit,
se.bands.ns2 $fit - 2 * pred.ns2$se.fit) pred.ns2
Natural Cubic Spline (3 df):
= lm(y ~ ns(x, df = 3))
ns3 = predict(ns3, newdata = list(x = grid), se = TRUE)
pred.ns3 = cbind(pred.ns3$fit + 2 * pred.ns3$se.fit,
se.bands.ns3 $fit - 2 * pred.ns3$se.fit) pred.ns3
Smoothing Spline (3 effective df):
= smooth.spline(x, y, df = 3) smooth
Plot:
par(mfrow = c(2,3))
plot(x, y, col="darkgrey", xlab = x.lab, ylab = y.lab, main = "Cubic Spline", bty = 'l', pch = 16)
lines(grid, pred.bs$fit, lwd = 2, col = 1)
matlines(grid, se.bands.bs, lwd = 2, col = 1, lty = 3)
plot(x, y, col="darkgrey", xlab = x.lab, ylab = y.lab, main = "Natural Cubic Spline", bty = 'l', pch = 16)
lines(grid, pred.nsk$fit, lwd = 2, col = 2)
matlines(grid, se.bands.nsk, lwd = 2, col = 2, lty = 3)
plot(x, y, col="darkgrey", xlab = x.lab, ylab = y.lab, main = "Natural Spline (2 df)", bty = 'l', pch = 16)
lines(grid, pred.ns2$fit, lwd = 2, col = 3)
matlines(grid, se.bands.ns2, lwd = 2, col = 3, lty = 3)
plot(x, y, col="darkgrey", xlab = x.lab, ylab = y.lab, main = "Natural Spline (3 df)", bty = 'l', pch = 16)
lines(grid, pred.ns3$fit, lwd = 2, col = 4)
matlines(grid, se.bands.ns3, lwd = 2, col = 4, lty = 3)
plot(x, y, col="darkgrey", xlab = x.lab, ylab = y.lab, main = "Smoothing Spline (3 Effective df)", bty = 'l', pch = 16)
lines(smooth, lwd = 2, col = 6)