top of page

Comparison of PCA, Lasso and Ridge Regression

Writer's picture: Karl 曹Karl 曹

In this post, I will compare the performance of PCA, Lasso, and Ridge regression at a relatively high dimensional DGP.

Number of observations n = 100

Number of dependent variables p = 10

Error term e ~ N(0, 1)

Dependent variables follow multinormal distribution, vector of means mu is

mu = (0, 10, 9, 8, 7, 6, 5, 4, 3, 2), and the variance-covariance matrix is:

The true beta parameters are randomly sampled from the interval [0, 0.1], and y = X beta + e


The benchmark case is shown below, we can see that Lasso performs relatively better than all other three, and naive OLS performs the worst.

1 Non-zero large beta

Now that we have our first simulation case, I changed the beta value, and now all beta are sampled from the interval [2, 3].

Not surprisingly, Lasso performs the worst, because Lasso always makes some parameters be 0, in such high dimensional setting.

Meanwhile, when we look closer to the other three, we cannot find any significant difference.


2 Highly correlated var-covariance matrix

Here we only change our sigma matrix, let the diagnoal be 1, and all covariance be 0.999999.

Not surprisingly, all three machine learning methods outperform OLS, and we can see that the MSE of PCR has a larger variance.




Code

T<-100
p2<-10
I<-diag(1,p2,p2)
beta.hat.ols<-matrix(NA,nrow=T,ncol=p2)
beta.hat.ridge<-matrix(NA,nrow=T,ncol=p2)
mse.ols<-c()
mse.ri<-c()
mse.lasso<-c()
mse.pcr<-c()
y.pre<-matrix(NA,nrow=T,ncol=n)
y.pre.ri<-matrix(NA,nrow=T,ncol=n)
y.pre.lasso<-matrix(NA,nrow=T,ncol=n)
y.pre.pcr<-matrix(NA,nrow=T,ncol=n)
##for the sake of convenience,we design another dgp 
mu<-c(0,10,9,8,7,6,5,4,3,2)
sigma<-matrix(c(2,rep(0.01,10),2,rep(0.01,10),3,rep(0.01,10),3,
                  rep(0.01,10),1,rep(0.01,10),1,rep(0.01,10),2,rep(0.01,10),
                  1,rep(0.01,10),2,rep(0.01,10),1),nrow=p2,ncol=p2,byrow = TRUE)
sigma
beta.true<-sample(beta_range,10)
beta.true
e.mu<-0
e.sigma<-1
dgp2 = function(n, beta.true, mu, sigma){
  X<-mvrnorm(n,mu,sigma)
  error<-rnorm(n,e.mu,e.sigma)
  y <- X%*% beta.true + error
  data<-data.frame(y,X)
  return(data)
}
grid <- 10^seq(-1,3,length=100)
###the begin to simulation
set.seed(527)
for (i in 1:T){
  train.data<-dgp2(n,beta.true,mu,sigma)
  test.data<-dgp2(n,beta.true,mu,sigma)
  x.train<-cbind(train.data[,2],train.data[,3],train.data[,4],train.data[,5],train.data[,6],train.data[,7],train.data[,8],train.data[,9],train.data[,10],train.data[,11])
  x.test<-cbind(test.data[,2],test.data[,3],test.data[,4],test.data[,5],test.data[,6],test.data[,7],test.data[,8],test.data[,9],test.data[,10],test.data[,11])
  y.train<-train.data$y
  y.test<-test.data$y
  beta.hat.ols[i,]<-solve(t(x.train)%*%x.train)%*%t(x.train)%*%y.train
  y.pre[i,]<-x.test%*%(beta.hat.ols[i,])
  mse.ols[i]<-mean((y.pre[i,]-y.test)^2)
  ###for ridge regression
  cv.out1<-cv.glmnet(x.train,y.train,lambda = grid,alpha=0,intercept=FALSE)
  best_lambda1<-cv.out1$lambda.min
  beta.hat.ridge[i,]<-solve((t(x.train)%*%x.train)+best_lambda1*I)%*%t(x.train)%*%y.train
  y.pre.ri[i,]<-x.test%*%beta.hat.ridge[i,]
  mse.ri[i]<-mean((y.pre.ri[i,]-y.test)^2)
  ###for lasso
  lasso.mod<-glmnet(x.train,y.train, alpha=1, lambda = grid,intercept=F)
  cv.out<-cv.glmnet(x.train, y.train,alpha=1,intercept=F)
  bestlambda<-cv.out$lambda.min
  y.pre.lasso[i,]<-predict(lasso.mod, s=bestlambda, newx = x.test)
  mse.lasso[i]<-mean((y.pre.lasso[i,]-y.test)^2)
  ####for PCR 
  pcr.fit<-pcr(y.train~x.train,data=train.data,scale=TRUE,validation="CV")
  cverr <- RMSEP(pcr.fit)$val[1,,-1]
  imin<-which.min(cverr)
  y.pre.pcr[i,]<-predict(pcr.fit,x.test,ncomp = imin)
  mse.pcr[i]<-mean((y.pre.pcr[i,]-y.test)^2)
}
data.box<-cbind(mse.ols,mse.ri,mse.lasso,mse.pcr)
boxplot(data.box, names=c("OLS","Ridge","Lasso","PCR"),
        main="MSE",lwd=2)
abline(h=1,col="darkred",lwd=3)##### variance error term
######################################b
######first one :change beta
beta_range1<-seq(2,3,by=0.01)
beta.true1<-sample(beta_range1,10)
for (i in 1:T){
  train.data<-dgp2(n,beta.true1,mu,sigma)
  test.data<-dgp2(n,beta.true1,mu,sigma)
  x.train<-cbind(train.data[,2],train.data[,3],train.data[,4],train.data[,5],train.data[,6],train.data[,7],train.data[,8],train.data[,9],train.data[,10],train.data[,11])
  x.test<-cbind(test.data[,2],test.data[,3],test.data[,4],test.data[,5],test.data[,6],test.data[,7],test.data[,8],test.data[,9],test.data[,10],test.data[,11])
  y.train<-train.data$y
  y.test<-test.data$y
  beta.hat.ols[i,]<-solve(t(x.train)%*%x.train)%*%t(x.train)%*%y.train
  y.pre[i,]<-x.test%*%(beta.hat.ols[i,])
  mse.ols[i]<-mean((y.pre[i,]-y.test)^2)
  ###for ridge regression
  cv.out1<-cv.glmnet(x.train,y.train,lambda = grid,alpha=0,intercept=FALSE)
  best_lambda1<-cv.out1$lambda.min
  beta.hat.ridge[i,]<-solve((t(x.train)%*%x.train)+best_lambda1*I)%*%t(x.train)%*%y.train
  y.pre.ri[i,]<-x.test%*%beta.hat.ridge[i,]
  mse.ri[i]<-mean((y.pre.ri[i,]-y.test)^2)
  ###for lasso
  lasso.mod<-glmnet(x.train,y.train, alpha=1, lambda = grid,intercept=F)
  cv.out<-cv.glmnet(x.train, y.train,alpha=1,intercept=F)
  bestlambda<-cv.out$lambda.min
  y.pre.lasso[i,]<-predict(lasso.mod, s=bestlambda, newx = x.test)
  mse.lasso[i]<-mean((y.pre.lasso[i,]-y.test)^2)
  ####for PCR 
  pcr.fit<-pcr(y.train~x.train,data=train.data,scale=TRUE,validation="CV")
  cverr <- RMSEP(pcr.fit)$val[1,,-1]
  imin <- which.min(cverr)
  y.pre.pcr[i,]<-predict(pcr.fit,x.test,ncomp = imin)
  mse.pcr[i]<-mean((y.pre.pcr[i,]-y.test)^2)
}
data.box2<-cbind(mse.ols,mse.ri,mse.lasso,mse.pcr)
boxplot(data.box2, names=c("OLS","Ridge","Lasso","PCR"),
        main="MSE",lwd=2)
boxplot(data.box2[,-3], names=c("Ridge","Lasso","PCR"),
        main="MSE",lwd=2)
abline(h=1,col="darkred",lwd=3)##### variance error term
#########second way:change sigma
####construct a function to define the sigma matrix
si<-function(s){
  for (i in 1:10){
    sm[i,i]<-s[i]
  }
  for (i in 1:10){
    for (j in 1:10){
      if(j!=i){
        sm[i,j]<-sqrt(sm[i,i]*sm[j,j])*0.999999
        sm[j,i]<-sm[i,j]
      }
    }
  }
  return(sm)
}
sm<-matrix(NA,10,10)
s<-c(1,1,1,1,1,1,1,1,1,1)
si(s)
sigma<-si(s)
###simulation
for (i in 1:T){
  train.data<-dgp2(n,beta.true,mu,sigma)
  test.data<-dgp2(n,beta.true,mu,sigma)
  x.train<-cbind(train.data[,2],train.data[,3],train.data[,4],train.data[,5],train.data[,6],train.data[,7],train.data[,8],train.data[,9],train.data[,10],train.data[,11])
  x.test<-cbind(test.data[,2],test.data[,3],test.data[,4],test.data[,5],test.data[,6],test.data[,7],test.data[,8],test.data[,9],test.data[,10],test.data[,11])
  y.train<-train.data$y
  y.test<-test.data$y
  beta.hat.ols[i,]<-solve(t(x.train)%*%x.train)%*%t(x.train)%*%y.train
  y.pre[i,]<-x.test%*%(beta.hat.ols[i,])
  mse.ols[i]<-mean((y.pre[i,]-y.test)^2)
  ###for ridge regression
  cv.out1<-cv.glmnet(x.train,y.train,lambda = grid,alpha=0,intercept=FALSE)
  best_lambda1<-cv.out1$lambda.min
  beta.hat.ridge[i,]<-solve((t(x.train)%*%x.train)+best_lambda1*I)%*%t(x.train)%*%y.train
  y.pre.ri[i,]<-x.test%*%beta.hat.ridge[i,]
  mse.ri[i]<-mean((y.pre.ri[i,]-y.test)^2)
  ###for lasso
  lasso.mod<-glmnet(x.train,y.train, alpha=1, lambda = grid,intercept=F)
  cv.out<-cv.glmnet(x.train, y.train,alpha=1,intercept=F)
  bestlambda<-cv.out$lambda.min
  y.pre.lasso[i,]<-predict(lasso.mod, s=bestlambda, newx = x.test)
  mse.lasso[i]<-mean((y.pre.lasso[i,]-y.test)^2)
  ####for PCR 
  pcr.fit<-pcr(y.train~x.train,data=train.data,scale=TRUE,validation="CV")
  cverr <- RMSEP(pcr.fit)$val[1,,-1]
  imin <- which.min(cverr)
  y.pre.pcr[i,]<-predict(pcr.fit,x.test,ncomp = imin)
  mse.pcr[i]<-mean((y.pre.pcr[i,]-y.test)^2)
}
data.box4<-cbind(mse.ols,mse.ri,mse.lasso,mse.pcr)
boxplot(data.box4, names=c("OLS","Ridge","Lasso","PCR"),
        main="MSE", lwd=2)
abline(h=1,col="darkred",lwd=3)

5 views0 comments

Recent Posts

See All

Comments


bottom of page