In this post, I will compare the performance of PCR (principal component regression), Lasso, and Ridge regression on a relatively high-dimensional DGP.
Number of observations n = 100
Number of independent variables (predictors) p = 10
Error term e ~ N(0, 1)
The independent variables follow a multivariate normal distribution; the vector of means mu is
mu = (0, 10, 9, 8, 7, 6, 5, 4, 3, 2), and the variance-covariance matrix is:
The true beta parameters are randomly sampled from the interval [0, 0.1], and y = X beta + e
The benchmark case is shown below: we can see that Lasso performs somewhat better than the other three methods, and naive OLS performs the worst.
1 Non-zero large beta
Starting from the benchmark case, I changed the beta values: now all betas are sampled from the interval [2, 3].
Not surprisingly, Lasso now performs the worst: Lasso shrinks some coefficients exactly to zero, which is harmful when every true coefficient is large.
Meanwhile, when we look more closely at the other three, we cannot find any significant difference among them.
2 Highly correlated var-covariance matrix
Here we only change the sigma matrix, letting the diagonal entries be 1 and all off-diagonal covariances be 0.999999.
Not surprisingly, all three machine learning methods outperform OLS, and we can see that the MSE of PCR has a larger variance.
Code
## Simulation settings -------------------------------------------------------
## T Monte Carlo replications of a DGP with n observations and p2 predictors.
## NOTE: `T` masks the TRUE shorthand; the name is kept because the loops
## below iterate over 1:T.
T<-100    # number of Monte Carlo replications
n<-100    # number of observations per replication (was previously undefined)
p2<-10    # number of predictors
I<-diag(1,p2,p2)  # identity matrix used in the ridge closed-form estimator
## Coefficient estimates, one row per replication
beta.hat.ols<-matrix(NA,nrow=T,ncol=p2)
beta.hat.ridge<-matrix(NA,nrow=T,ncol=p2)
## Out-of-sample MSE per replication for each estimator (preallocated
## instead of grown from an empty vector)
mse.ols<-numeric(T)
mse.ri<-numeric(T)
mse.lasso<-numeric(T)
mse.pcr<-numeric(T)
## Test-set predictions, one row per replication
y.pre<-matrix(NA,nrow=T,ncol=n)
y.pre.ri<-matrix(NA,nrow=T,ncol=n)
y.pre.lasso<-matrix(NA,nrow=T,ncol=n)
y.pre.pcr<-matrix(NA,nrow=T,ncol=n)
## For the sake of convenience, we design another DGP:
## X ~ MVN(mu, sigma), y = X beta + e with e ~ N(e.mu, e.sigma^2)
mu<-c(0,10,9,8,7,6,5,4,3,2)
sigma<-matrix(c(2,rep(0.01,10),2,rep(0.01,10),3,rep(0.01,10),3,
rep(0.01,10),1,rep(0.01,10),1,rep(0.01,10),2,rep(0.01,10),
1,rep(0.01,10),2,rep(0.01,10),1),nrow=p2,ncol=p2,byrow = TRUE)
sigma
## True coefficients sampled from [0, 0.1] (beta_range was previously undefined)
beta_range<-seq(0,0.1,by=0.01)
beta.true<-sample(beta_range,p2)
beta.true
e.mu<-0    # error term mean
e.sigma<-1 # error term standard deviation
## Generate one simulated data set from the DGP:
##   X ~ MVN(mu, sigma),  y = X %*% beta.true + e,  e ~ N(err.mean, err.sd^2)
##
## n         number of observations
## beta.true true coefficient vector (length ncol(sigma))
## mu        mean vector of the predictors
## sigma     variance-covariance matrix of the predictors
## err.mean  error mean (defaults to the global e.mu, as before)
## err.sd    error standard deviation (defaults to the global e.sigma)
##
## Returns a data.frame whose first column is y and whose remaining
## columns are the predictors.
dgp2 <- function(n, beta.true, mu, sigma, err.mean = e.mu, err.sd = e.sigma){
  X <- mvrnorm(n, mu, sigma)  # MASS::mvrnorm
  error <- rnorm(n, err.mean, err.sd)
  y <- X %*% beta.true + error
  data.frame(y, X)
}
## Candidate lambda grid for ridge/lasso tuning: 100 values from 0.1 to 1000.
grid <- 10^seq(-1,3,length=100)
### Benchmark simulation: small true betas, mildly correlated predictors.
set.seed(527)
for (i in 1:T){
# Draw independent training and test samples from the same DGP.
train.data<-dgp2(n,beta.true,mu,sigma)
test.data<-dgp2(n,beta.true,mu,sigma)
# Columns 2..11 of the data frame hold the 10 predictors (column 1 is y).
x.train<-cbind(train.data[,2],train.data[,3],train.data[,4],train.data[,5],train.data[,6],train.data[,7],train.data[,8],train.data[,9],train.data[,10],train.data[,11])
x.test<-cbind(test.data[,2],test.data[,3],test.data[,4],test.data[,5],test.data[,6],test.data[,7],test.data[,8],test.data[,9],test.data[,10],test.data[,11])
y.train<-train.data$y
y.test<-test.data$y
### OLS via the normal equations (no intercept), scored on the test sample.
beta.hat.ols[i,]<-solve(t(x.train)%*%x.train)%*%t(x.train)%*%y.train
y.pre[i,]<-x.test%*%(beta.hat.ols[i,])
mse.ols[i]<-mean((y.pre[i,]-y.test)^2)
### Ridge regression: lambda picked by CV, then plugged into the closed form.
# NOTE(review): cv.glmnet minimizes (1/2n)*RSS + lambda*penalty, so its
# lambda.min is on a different scale than the lambda in the textbook
# closed-form (X'X + lambda*I)^{-1} X'y used below — confirm this is intended.
cv.out1<-cv.glmnet(x.train,y.train,lambda = grid,alpha=0,intercept=FALSE)
best_lambda1<-cv.out1$lambda.min
beta.hat.ridge[i,]<-solve((t(x.train)%*%x.train)+best_lambda1*I)%*%t(x.train)%*%y.train
y.pre.ri[i,]<-x.test%*%beta.hat.ridge[i,]
mse.ri[i]<-mean((y.pre.ri[i,]-y.test)^2)
### Lasso: fit on the fixed grid, predict at the CV-chosen lambda.
# NOTE(review): cv.glmnet here uses its own default lambda path, not `grid`,
# so s=bestlambda may be interpolated by predict() — verify this is intended.
lasso.mod<-glmnet(x.train,y.train, alpha=1, lambda = grid,intercept=F)
cv.out<-cv.glmnet(x.train, y.train,alpha=1,intercept=F)
bestlambda<-cv.out$lambda.min
y.pre.lasso[i,]<-predict(lasso.mod, s=bestlambda, newx = x.test)
mse.lasso[i]<-mean((y.pre.lasso[i,]-y.test)^2)
### PCR (pls package): choose the number of components minimizing CV RMSEP.
pcr.fit<-pcr(y.train~x.train,data=train.data,scale=TRUE,validation="CV")
# val[1,,-1] drops the 0-component (intercept-only) column of CV errors.
cverr <- RMSEP(pcr.fit)$val[1,,-1]
imin<-which.min(cverr)
y.pre.pcr[i,]<-predict(pcr.fit,x.test,ncomp = imin)
mse.pcr[i]<-mean((y.pre.pcr[i,]-y.test)^2)
}
## Compare the four estimators' test-MSE distributions across replications.
data.box<-cbind(mse.ols,mse.ri,mse.lasso,mse.pcr)
boxplot(data.box, names=c("OLS","Ridge","Lasso","PCR"),
main="MSE",lwd=2)
abline(h=1,col="darkred",lwd=3) # reference line at the error variance (= 1)
## Variation 1: large true betas -------------------------------------------
### Same pipeline as the benchmark loop, but betas are sampled from [2, 3].
beta_range1<-seq(2,3,by=0.01)
beta.true1<-sample(beta_range1,10)
# No set.seed here: this loop continues the RNG stream from the benchmark run.
for (i in 1:T){
train.data<-dgp2(n,beta.true1,mu,sigma)
test.data<-dgp2(n,beta.true1,mu,sigma)
# Columns 2..11 hold the 10 predictors (column 1 is y).
x.train<-cbind(train.data[,2],train.data[,3],train.data[,4],train.data[,5],train.data[,6],train.data[,7],train.data[,8],train.data[,9],train.data[,10],train.data[,11])
x.test<-cbind(test.data[,2],test.data[,3],test.data[,4],test.data[,5],test.data[,6],test.data[,7],test.data[,8],test.data[,9],test.data[,10],test.data[,11])
y.train<-train.data$y
y.test<-test.data$y
### OLS via the normal equations (no intercept).
beta.hat.ols[i,]<-solve(t(x.train)%*%x.train)%*%t(x.train)%*%y.train
y.pre[i,]<-x.test%*%(beta.hat.ols[i,])
mse.ols[i]<-mean((y.pre[i,]-y.test)^2)
### Ridge: CV-chosen lambda plugged into the closed-form estimator.
# NOTE(review): glmnet's lambda scale differs from the closed form's — see
# the benchmark loop.
cv.out1<-cv.glmnet(x.train,y.train,lambda = grid,alpha=0,intercept=FALSE)
best_lambda1<-cv.out1$lambda.min
beta.hat.ridge[i,]<-solve((t(x.train)%*%x.train)+best_lambda1*I)%*%t(x.train)%*%y.train
y.pre.ri[i,]<-x.test%*%beta.hat.ridge[i,]
mse.ri[i]<-mean((y.pre.ri[i,]-y.test)^2)
### Lasso with CV-chosen lambda.
lasso.mod<-glmnet(x.train,y.train, alpha=1, lambda = grid,intercept=F)
cv.out<-cv.glmnet(x.train, y.train,alpha=1,intercept=F)
bestlambda<-cv.out$lambda.min
y.pre.lasso[i,]<-predict(lasso.mod, s=bestlambda, newx = x.test)
mse.lasso[i]<-mean((y.pre.lasso[i,]-y.test)^2)
### PCR with the CV-optimal number of components.
pcr.fit<-pcr(y.train~x.train,data=train.data,scale=TRUE,validation="CV")
cverr <- RMSEP(pcr.fit)$val[1,,-1]
imin <- which.min(cverr)
y.pre.pcr[i,]<-predict(pcr.fit,x.test,ncomp = imin)
mse.pcr[i]<-mean((y.pre.pcr[i,]-y.test)^2)
}
data.box2<-cbind(mse.ols,mse.ri,mse.lasso,mse.pcr)
boxplot(data.box2, names=c("OLS","Ridge","Lasso","PCR"),
main="MSE",lwd=2)
# Second boxplot drops the Lasso column so the remaining three are comparable.
boxplot(data.box2[,-3], names=c("Ridge","Lasso","PCR"),
main="MSE",lwd=2)
abline(h=1,col="darkred",lwd=3) # reference line at the error variance (= 1)
## Variation 2: highly correlated predictors --------------------------------
## Build a p x p covariance matrix with variances `s` on the diagonal and
## pairwise correlation `rho` between every pair of variables:
##   sm[i,j] = sqrt(s[i] * s[j]) * rho  for i != j.
##
## s   vector of variances (length determines the matrix dimension; the
##     original hard-coded 10)
## rho common correlation, default 0.999999 as in the original
##
## Returns the symmetric covariance matrix. The matrix is now built locally
## instead of filling a global `sm`, which the original silently required
## to exist in the workspace.
si<-function(s, rho = 0.999999){
  p <- length(s)
  sm <- diag(x = s, nrow = p, ncol = p)
  for (i in seq_len(p)){
    for (j in seq_len(p)){
      if (j != i){
        sm[i, j] <- sqrt(s[i] * s[j]) * rho
      }
    }
  }
  sm
}
# Workspace matrix required by si(), plus unit variances for all 10 predictors.
sm <- matrix(NA, nrow = 10, ncol = 10)
s <- rep(1, 10)
si(s)           # echo the resulting covariance matrix for inspection
sigma <- si(s)  # overwrite sigma with the highly correlated version
### Simulation with the highly correlated sigma; otherwise identical to the
### benchmark loop (original small betas, no fresh set.seed — the RNG stream
### continues from the previous runs).
for (i in 1:T){
train.data<-dgp2(n,beta.true,mu,sigma)
test.data<-dgp2(n,beta.true,mu,sigma)
# Columns 2..11 hold the 10 predictors (column 1 is y).
x.train<-cbind(train.data[,2],train.data[,3],train.data[,4],train.data[,5],train.data[,6],train.data[,7],train.data[,8],train.data[,9],train.data[,10],train.data[,11])
x.test<-cbind(test.data[,2],test.data[,3],test.data[,4],test.data[,5],test.data[,6],test.data[,7],test.data[,8],test.data[,9],test.data[,10],test.data[,11])
y.train<-train.data$y
y.test<-test.data$y
### OLS via the normal equations (no intercept).
# NOTE(review): with correlation 0.999999, X'X is near-singular and solve()
# may be numerically unstable or error — this is exactly the stress case.
beta.hat.ols[i,]<-solve(t(x.train)%*%x.train)%*%t(x.train)%*%y.train
y.pre[i,]<-x.test%*%(beta.hat.ols[i,])
mse.ols[i]<-mean((y.pre[i,]-y.test)^2)
### Ridge: CV-chosen lambda plugged into the closed-form estimator.
cv.out1<-cv.glmnet(x.train,y.train,lambda = grid,alpha=0,intercept=FALSE)
best_lambda1<-cv.out1$lambda.min
beta.hat.ridge[i,]<-solve((t(x.train)%*%x.train)+best_lambda1*I)%*%t(x.train)%*%y.train
y.pre.ri[i,]<-x.test%*%beta.hat.ridge[i,]
mse.ri[i]<-mean((y.pre.ri[i,]-y.test)^2)
### Lasso with CV-chosen lambda.
lasso.mod<-glmnet(x.train,y.train, alpha=1, lambda = grid,intercept=F)
cv.out<-cv.glmnet(x.train, y.train,alpha=1,intercept=F)
bestlambda<-cv.out$lambda.min
y.pre.lasso[i,]<-predict(lasso.mod, s=bestlambda, newx = x.test)
mse.lasso[i]<-mean((y.pre.lasso[i,]-y.test)^2)
### PCR with the CV-optimal number of components.
pcr.fit<-pcr(y.train~x.train,data=train.data,scale=TRUE,validation="CV")
cverr <- RMSEP(pcr.fit)$val[1,,-1]
imin <- which.min(cverr)
y.pre.pcr[i,]<-predict(pcr.fit,x.test,ncomp = imin)
mse.pcr[i]<-mean((y.pre.pcr[i,]-y.test)^2)
}
data.box4<-cbind(mse.ols,mse.ri,mse.lasso,mse.pcr)
boxplot(data.box4, names=c("OLS","Ridge","Lasso","PCR"),
main="MSE", lwd=2)
abline(h=1,col="darkred",lwd=3) # reference line at the error variance (= 1)
Comments