#Simple Regression sqrt(9) sin(2) 2/5 c(1,2,4) mdat <- matrix(c(1,2,3, 11,12,13), nrow = 2, ncol = 3, dimnames = list(c("row1", "row2"), c("C.1", "C.2", "C.3"))) mdat y=c(70,65,90,95,110,115,120,140,155,150) x=c(80,100,120,140,160,180,200,220,240,260) model=lm(y~-1+x) mean(x) sum(x^2) summary(model) Y=c(1004,1632,852,1506,1372,1270,1269,903,1555,1260,1146,1276,1225,1321) X1=c(195,255,255,255,225,225,225,195,225,225,225,225,225,230) X2=c(4,4,4.6,4.6,4.2,4.1,4.6,4.3,4.3,4,4.7,4.3,4.72,4.3) model2=lm(Y~X1+X2) summary(model2) n=14 X=cbind(rep(1,14),X1,X2) bhat=solve(t(X)%*%X)%*%t(X)%*%Y;bhat SSR=t(bhat)%*%t(X)%*%Y-n*(mean(Y))^2;SSR SSE=t(Y)%*%Y-t(bhat)%*%t(X)%*%Y;SSE SSY=t(Y)%*%Y-n*(mean(Y))^2;SSY MSE=SSE/(n-3) C=solve(t(X)%*%X) Q1=(bhat[2])^2/C[2,2] F1=Q1/MSE;F1 USairpollution=matrix(c(46,11,24,47,11,31,110,23,65,26,9,17,17,35,56,10,28,14,14,13,30,10,10,16,29,18,9,31,14,69,10,61,94,26,28,12,29,56,29,8,36,47.6,56.8,61.5,55.0,47.1,55.2,50.6,54.0,49.7,51.5,66.2,51.9,49.0,49.9,49.1,68.9,52.3,68.4,54.5,61.0,55.6,61.6,75.5,45.7,43.5,59.4,68.3,59.3,51.5,54.6,70.3,50.4,50.0,57.8,51.0,56.7,51.1,55.9,57.3,56.6,54.0,44,46,368,652,391,35,3344,462,1007,266,641,454,104,1064,412,721,361,136,381,91,291,337,207,569,699,275,204,96,181,1692,213,347,343,197,137,453,379,775,434,125,80,116,244,497,905,463,71,3369,453,751,540,844,515,201,1513,158,1233,746,529,507,132,593,624,335,717,744,448,361,308,347,1950,582,520,179,299,176,716,531,622,757,277,80,8.8,8.9,9.1,9.6,12.4,6.5,10.4,7.1,10.9,8.6,10.9,9.0,11.2,10.1,9.0,10.8,9.7,8.8,10.0,8.2,8.3,9.2,9.0,11.8,10.6,7.9,8.4,10.6,10.9,9.6,6.0,9.4,10.6,7.6,8.7,8.7,9.4,9.5,9.3,12.7,9.0,33.36,7.77,48.34,41.31,36.11,40.75,34.44,39.04,34.99,37.01,35.94,12.95,30.85,30.96,43.37,48.19,38.74,54.47,37.00,48.52,43.11,49.10,59.80,29.07,25.94,46.00,56.77,44.68,30.18,39.93,7.05,36.22,42.75,42.59,15.17,20.66,38.79,35.89,38.89,30.58,40.25,135,58,115,111,166,148,122,132,155,134,78,86,103,129,127,103,121,116,99,100,123,105,128,123,137,119,113,116,98,115,36,147,125,115,89,67,164,105,111,82,114),41) cities = c("Albany", "Albuquerque","Atlanta","Baltimore","Buffalo","Charleston","Chicago", "Cincinnati","Cleveland","Columbus","Dallas","Denver","DesMoines","Detroit","Hartford","Houston","Indianapolis","Jacksonville","Kansas City","Little Rock","Louisville","Memphis","Miami", "Milwaukee","Minneapolis","Nashville","New Orleans","Norfolk","Omaha","Philadelphia", "Phoenix","Pittsburgh","Providence","Richmond","Salt Lake City","San Francisco","Seattle","St. Louis","Washington","Wichita","Wilmington") variables = c("SO2","temp","manu","popul","wind","precip","predays") colnames(USairpollution) = variables rownames(USairpollution)= cities d<- y<-d[,1] x1<-d[,2] x2<-d[,3] x3<-d[,4] data1=c(y,x1) m1=lm(y~x1) summary(m1) m=lm(y~-1+USairpollution[,c(-1,-7)]) summary(m) yhat<-fitted(m) e=y-yhat plot(e~yhat) plot(e~x1) plot(e~x2) plot(e~x3) qqnorm(e) par(mfrow=c(1,4)) plot(m) plot(x1,y) abline(lm(y~x1)) yhat<-fitted(m1) predict(m1) confint(m1,level=0.95) #regression diagnostics lm.influence(m1) par(mfrow=c(2,2)) plot(m1) lev = hat(model.matrix(m1)) plot(lev) data1[lev >0.2] #hat : a vector containing the diagonal of the hat matrix. #coefficients: a matrix whose i-th row contains the change in the estimated #coefficients which results when the i-th case is dropped from the regression. Note that aliased coefficients are not included in the matrix. #sigma:a vector whose i-th element contains the estimate of the residual #standard deviation obtained when the i-th case is dropped from the regression (The approximations needed for GLMs can result in this being NaN.) #wt.res: a vector of weighted residuals. r<-resid(lm(y~x1)) sr=rstandard(m1) dffits(m1) cooks.distance(m1) dfbeta(m1) covratio(m1) summary(mm) ####################Transformation in simple regression library(alr3) data(salarygov) attach(salarygov) #Figure 3.32 on page 96 m1 <- lm(MaxSalary~Score) par(mfrow=c(2,2)) plot(Score,MaxSalary) abline(m1,lty=2,col=2) StanRes1 <- rstandard(m1) absrtsr1 <- sqrt(abs(StanRes1)) plot(Score,StanRes1,ylab="Standardized Residuals") plot(Score,absrtsr1,ylab="Square Root(|Standardized Residuals|)") abline(lsfit(Score,absrtsr1),lty=2,col=2) #Output from R on page 96 library(MASS) library(car) powerTransform(cbind(MaxSalary,Score)) summary(powerTransform(cbind(MaxSalary,Score))) #Figure 3.33 on page 97 par(mfrow=c(3,2)) plot(density(MaxSalary,bw="SJ",kern="gaussian"),type="l", main="Gaussian kernel density estimate",xlab="MaxSalary") rug(MaxSalary) boxplot(MaxSalary,ylab="MaxSalary") qqnorm(MaxSalary, ylab = "MaxSalary") qqline(MaxSalary, lty = 2, col=2) plot(density(Score,bw="SJ",kern="gaussian"),type="l", main="Gaussian kernel density estimate",xlab="Score") rug(Score) boxplot(Score,ylab="Score") qqnorm(Score, ylab = "Score") qqline(Score, lty = 2, col=2) #Figure 3.34 on page 97 par(mfrow=c(1,1)) plot(sqrt(Score),log(MaxSalary),xlab=expression(sqrt(Score))) abline(lsfit(sqrt(Score),log(MaxSalary)),lty=2,col=2) #Figure 3.35 on page 98 par(mfrow=c(3,2)) plot(density(log(MaxSalary),bw="SJ",kern="gaussian"),type="l", main="Gaussian kernel density estimate",xlab="log(MaxSalary)") rug(log(MaxSalary)) boxplot(log(MaxSalary),ylab="log(MaxSalary)") qqnorm(log(MaxSalary), ylab = "log(MaxSalary)") qqline(log(MaxSalary), lty = 2, col=2) sj <- bw.SJ(sqrt(Score),lower = 0.05, upper = 100) plot(density(sqrt(Score),bw=sj,kern="gaussian"),type="l", main="Gaussian kernel density estimate",xlab=expression(sqrt(Score))) rug(sqrt(Score)) boxplot(sqrt(Score),ylab=expression(sqrt(Score))) qqnorm(sqrt(Score), ylab=expression(sqrt(Score))) qqline(sqrt(Score), lty = 2, col=2) #Figure 3.36 on page 99 m2 <- lm(log(MaxSalary)~sqrt(Score)) par(mfrow=c(1,2)) StanRes2 <- rstandard(m2) absrtsr2 <- sqrt(abs(StanRes2)) plot(sqrt(Score),StanRes2,ylab="Standardized Residuals",xlab=expression(sqrt(Score))) plot(sqrt(Score),absrtsr2,ylab="Square Root(|Standardized Residuals|)",xlab=expression(sqrt(Score))) abline(lsfit(sqrt(Score),absrtsr2),lty=2,col=2) summary(m1) summary(m2) ####################Transformation in multiple regression magazines <- read.csv("magazines.csv", header=TRUE) attach(magazines) #R output on page 177 library(alr3) summary(powerTransform(cbind(AdRevenue,AdPages,SubRevenue,NewsRevenue))) #Figure 6.21 on page 178 pairs(AdRevenue~AdPages+SubRevenue+NewsRevenue) m0=lm(AdRevenue~AdPages+SubRevenue+NewsRevenue) #Figure 6.22 on page 179 tAdPages<- log(AdPages) tSubRevenue <- log(SubRevenue) tNewsRevenue <- log(NewsRevenue) m1 <- lm(AdRevenue~log(AdPages)+log(SubRevenue)+log(NewsRevenue)) library(alr3) par(mfrow=c(1,1)) inverse.response.plot(m1,key=TRUE) #R output on page 179 library(alr3) summary(tranxy <-powerTransform(cbind(AdRevenue,AdPages,SubRevenue,NewsRevenue)) #Figure 6.23 on page 180 pairs(log(AdRevenue)~log(AdPages)+log(SubRevenue)+log(NewsRevenue)) #Figure 6.24 on page 181 m2 <- lm(log(AdRevenue)~log(AdPages)+log(SubRevenue)+log(NewsRevenue)) summary(m0) summary(m2) par(mfrow=c(2,2)) StanRes2 <- rstandard(m2) plot(log(AdPages),StanRes2,ylab="Standardized Residuals") plot(log(SubRevenue),StanRes2,ylab="Standardized Residuals") plot(log(NewsRevenue),StanRes2,ylab="Standardized Residuals") plot(m2$fitted.values,StanRes2,ylab="Standardized Residuals",xlab="Fitted Values") ##########Variable Selection in Multiple Linear Regression############ ###Ex. 1: Bridge Construction bridge <- read.table("bridge.txt", header=TRUE) attach(bridge) library(alr3) summary(tranxy <- powerTransform(cbind(Time,DArea,CCost,Dwgs,Length,Spans))) testTransform(tranxy, c(0, 0, 0, 0,0,0)) # fit linear model with transformed response: coef(tranxy, round=TRUE) summary(lm(bcPower(Time,tranxy$lam)~bcPower(DArea,tranxy$lam)+bcPower(CCost,tranxy$lam)+bcPower(Dwgs,tranxy$lam)+bcPower(Length,tranxy$lam)+bcPower(Spans,tranxy$lam))) # save the rounded transformed values, plot them with a separate # color for each highway type transformedY <- bcPower(with(bridge,cbind(Time,DArea,CCost,Dwgs,Length,Spans)), coef(tranxy, round=TRUE)) pairs(transformedY,col=as.numeric(bridge$Time)) #Figure 6.40 page 198 pairs(log(Time)~log(CCost)+log(Dwgs)+log(Length)+log(Spans)+log(DArea),data=bridge,pch="*") #Figure 6.41 page 199 m0<- lm(log(Time)~log(CCost)+log(Dwgs)+log(Length)+log(Spans)+log(DArea)) summary(m0) StanRes1 <- rstandard(m0) par(mfrow=c(2,3)) plot(log(DArea),StanRes1, ylab="Standardized Residuals") plot(log(CCost),StanRes1, ylab="Standardized Residuals") plot(log(Dwgs),StanRes1, ylab="Standardized Residuals") plot(log(Length),StanRes1, ylab="Standardized Residuals") plot(log(Spans),StanRes1, ylab="Standardized Residuals") plot(m0$fitted.values,StanRes1, ylab="Standardized Residuals",xlab="Fitted values") #Figure 6.42 page 199 par(mfrow=c(1,1)) plot(m0$fitted.values,log(Time),xlab="Fitted Values") abline(lsfit(m0$fitted.values,log(Time))) #Figure 6.43 page 200 par(mfrow=c(2,2)) plot(m0) abline(h=-2,lty=2) abline(h=2,lty=2) identify(plot(m0)) abline(v=2*6/45,lty=2) #Regression output on page 200 summary(m0) #Figure 6.44 page 201 library(alr3) mmps(m0,layout=c(2,3)) #R output on page 202 logDArea <- log(DArea) logCCost <- log(CCost) logDwgs <- log(Dwgs) logLength <- log(Length) logSpans <- log(Spans) y<-log(Time) X <- cbind(logCCost,logDwgs,logLength,logSpans,logDArea) c <- cor(X) round(c,4);kappa(t(X)%*%X) vif(m0) #Figure 6.45 on page 202 library(car) par(mfrow=c(2,3)) avPlot(m0,variable=log(CCost),ask=FALSE,id=TRUE,col.lines=palette()[1],pch="*",lwd=2) avPlot(m0,variable=log(Dwgs),ask=FALSE,id=FALSE,col.lines=palette()[1],pch="*",lwd=2) avPlot(m0,variable=log(Length),ask=FALSE,id=FALSE,col.lines=palette()[1],pch="*",lwd=2) avPlot(m0,variable=log(Spans),ask=FALSE,id=FALSE,col.lines=palette()[1],pch="*",lwd=2) avPlot(m0,variable=log(DArea),ask=FALSE,id=FALSE,col.lines=palette()[1],pch="*",lwd=2) summary(lm(y~X)) m1<-lm(log(Time)~log(CCost)+log(Dwgs)+log(Spans));summary(m1) #R output on page 203 library(car) vif(m1) syy=sum(y^2)-n*(mean(y))^2 n=45 X=cbind(log(CCost),log(Dwgs),log(Spans)) lam<-eigen(t(X)%*%X)$values;sqrt(max(lam)/min(lam)) m1 <- lm(log(Time)~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans)) logDArea <- log(DArea) logCCost <- log(CCost) logDwgs <- log(Dwgs) logLength <- log(Length) logSpans <- log(Spans) X <- cbind(logDArea,logCCost,logDwgs,logLength,logSpans) install.packages("leaps") library(leaps) b <- regsubsets(as.matrix(X),log(Time)) rs <- summary(b) par(mfrow=c(1,2)) plot(1:5,rs$adjr2,xlab="Subset Size",ylab="Adjusted R-squared") library(car) subsets(b,statistic=c("adjr2")) #Table 7.1 on page 235 #Calculate adjusted R-squared rs$adjr2 rs$outmat om1 <- lm(log(Time)~log(Dwgs)) om2 <- lm(log(Time)~log(Dwgs)+log(Spans)) om3 <- lm(log(Time)~log(Dwgs)+log(Spans)+log(CCost)) om4 <- lm(log(Time)~log(Dwgs)+log(Spans)+log(CCost)+log(DArea)) om5 <- m1 #Subset size=1 n <- length(om1$residuals) npar <- length(om1$coefficients) +1 #Calculate AIC extractAIC(om1,k=2) #Calculate AICc extractAIC(om1,k=2)+2*npar*(npar+1)/(n-npar-1) #Calculate BIC extractAIC(om1,k=log(n)) #Subset size=2 npar <- length(om2$coefficients) +1 #Calculate AIC extractAIC(om2,k=2) #Calculate AICc extractAIC(om2,k=2)+2*npar*(npar+1)/(n-npar-1) #Calculate BIC extractAIC(om2,k=log(n)) #Subset size=3 npar <- length(om3$coefficients) +1 #Calculate AIC extractAIC(om3,k=2) #Calculate AICc extractAIC(om3,k=2)+2*npar*(npar+1)/(n-npar-1) #Calculate BIC extractAIC(om3,k=log(n)) #Subset size=4 npar <- length(om4$coefficients) +1 #Calculate AIC extractAIC(om4,k=2) #Calculate AICc extractAIC(om4,k=2)+2*npar*(npar+1)/(n-npar-1) #Calculate BIC extractAIC(om4,k=log(n)) #Subset size=5 npar <- length(om5$coefficients) +1 #Calculate AIC extractAIC(om5,k=2) #Calculate AICc extractAIC(om5,k=2)+2*npar*(npar+1)/(n-npar-1) #Calculate BIC extractAIC(om5,k=log(n)) #Regression output on pages 235 and 236 summary(om2) summary(om3) #Output from R on page 237 backAIC <- step(m1,direction="backward", data=bridge) backBIC <- step(m1,direction="backward", data=bridge, k=log(n)) #Output from R on page 238 mint <- lm(log(Time)~1,data=bridge) forwardAIC <- step(mint,scope=list(lower=~1, upper=~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans)), direction="forward", data=bridge) forwardBIC <- step(mint,scope=list(lower=~1, upper=~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans)), direction="forward", data=bridge,k=log(n)) stepwise <- step(mint,scope=list(lower=~1, upper=~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans)), direction="both", data=bridge) fullmodel <- lm(log(Time)~log(DArea)+log(CCost)+log(Dwgs)+log(Length) +log(Spans), data = bridge) step(fullmodel, direction = "backward", trace=TRUE ) ######Ridge Estimation################ y<-log(Time) X <- cbind(logCCost,logDwgs,logSpans) library(mctest) omcdiag(X,y) vif(om3) x=cbind(rep(1,n),log(CCost),log(Dwgs),log(Spans)) C=t(x)%*%x ###GCV=[1/n*sum(y_i-\hat y_i)^2]/[1-tr(H)/n]^2;\hat y=Hy L=function(k){ L=matrix(ncol=n,nrow=n) a=x%*%solve(C+k*diag(4))%*%t(x) return(a) } L(0) yhat=function(k){ a=L(k)%*%y return(a) } GCV1=function(kh){ k=kh[1] a=1/n*t(y-yhat(k))%*%(y-yhat(k))/(1-sum(diag(L(k)))/n)^2 return(a) } khopt=optim(c(.01),GCV1,method="L-BFGS-B",lower=c(0.00),upper=c(18.9))$par khopt; bk=function(k){ a=solve(C+k*diag(4))%*%t(x)%*%y return(a)} bk(0) GCV=function(k){ a=1/n*t(y-yhat(k))%*%(y-yhat(k))/(1-sum(diag(L(k)))/n)^2 return(a) } GCV(1) bk(khopt); syy=sum(y^2)-n*(mean(y))^2 1-GCV(1.04)/syy 1-GCV(khopt)/syy sek=c() for(i in 1:700){ sek[1]=GCV(0.00) sek[i+1]=GCV(i/10000) } tt2=(0:700)/10000 mm2=cbind(tt2,sek) m2=mm2[sort.list(mm2[,2]),] k1=m2[1,1];k1 m2[1,] mlab <- expression(k) plab <- "GCV" plot(tt2,sek,xlab= mlab,type="l",ylab = plab,pch="",cex.axis=0.6,cex.lab=1.) title((~italic(k[opt]==0.0279))) detach(bridge) ##################WAGE DATA SET###################### #Variable names in order from left to right: #EDUCATION: Number of years of education. #SOUTH: Indicator variable for Southern Region (1=Person lives in South, 0=Person lives elsewhere). #SEX: Indicator variable for sex (1=Female, 0=Male). #EXPERIENCE: Number of years of work experience. #UNION: Indicator variable for union membership (1=Union member, 0=Not union member). #WAGE: Wage (dollars per hour). #AGE: Age (years). #RACE: Race (1=Other, 2=Hispanic, 3=White). #OCCUPATION: Occupational category (1=Management,2=Sales,3=Clerical,4=Service, 5=Professional, 6=Other). #SECTOR: Sector (0=Other, 1=Manufacturing, 2=Construction). #MARR: Marital Status (0=Unmarried, 1=Married) data <- read.table("Wage.txt", header=TRUE) attach(data) wage n=534 y<-wage X<-data[,c(-6)] cor(X) library(mctest) omcdiag(X,log(y)) cbind(log(y),rep(1,534),education,south,sex,experience,union,age,race, occupation,sector,married) ################################################## fit1<-lm(log(y)~education+south+sex+experience+union+age+race+occupation+sector+married) summary(fit1) anova(fit1) r=resid(fit1) sum(r^2) plot(fit1) pred0 <- fitted(fit1,X) syy=sum(log(y)^2)-n*(mean(log(y)))^2 SSE=t(log(y)-pred0)%*%(log(y)-pred0);SSE;1-SSE/syy x=as.matrix(cbind(rep(1,n),X)) bhat=solve(t(x)%*%x)%*%t(x)%*%log(y) SSR=t(bhat)%*%t(x)%*%log(y)-n*(mean(log(y)))^2 SSR/syy;syy-SSR mlab <- "Observations" plab <- "Standardized Residuals" plot(rstandard(fit1),ylim=c(-2.5,2.75),xlab=mlab,ylab=plab) #abline(h=0) abline(h=c(-2,2),lty=3,col="darkblue",lwd=2) identify(rstandard(fit1)-.15) qqnorm(rstandard(fit1),ylab=plab) qqline(rstandard(fit1)) #-----------------------AVPLOTS #plots against explanatory variables m3=lm(log(wage)~education+south+sex+experience+union+age+race+occupation+sector+married) library(car) par(mfrow=c(1,3)) avPlot(m3,variable=education,pch="*",lwd=3,col.lines="darkblue") avPlot(m3,variable=south,pch="*",lwd=3,col.lines="darkblue") avPlot(m3,variable=sex,pch="*",lwd=3,col.lines="darkblue") #####LTS##################### library(robustbase) mRobust=ltsReg(log(y)~education+south+sex+experience+union+age+race+ occupation+sector+married) summary(mRobust) plot(mRobust, which = "rqq") out=which(mRobust$lts.wt==0);out b0=mRobust$coef yhat0=xg%*%b0 #####RLTS##################### yg=(y[-out]) xg=x[-out,] Xg=X[-out,] summary(m0<-lm(log(yg)~as.matrix(Xg))) syyg=sum(log(yg)^2)-(length(yg))*(mean(log(yg)))^2 syy=sum(log(y)^2)-(length(y))*(mean(log(y)))^2 n1=length(y[-out]) C0=t(xg)%*%xg bk=function(k){ a=solve(C0+k*diag(11))%*%t(xg)%*%yg return(a)} bk(0.1) #GCV0##################################################### L=function(k){ L=matrix(ncol=n1,nrow=n1) a=xg%*%solve(C0+k*diag(11))%*%t(xg) return(a) } yhat=function(k){ a=L(k)%*%log(yg) return(a) } GCV1=function(kh){ k=kh[1] a=1/n1*t(log(yg)-yhat(k))%*%(log(yg)-yhat(k))/(1-sum(diag(L(k)))/n1)^2 return(a) } khopt=optim(c(2.1),GCV1,method="L-BFGS-B",lower=c(0.00),upper=c(8.9))$par khopt; GCV=function(k){ a=1/n*t(log(yg)-yhat(k))%*%(log(yg)-yhat(k))/(1-sum(diag(L(k)))/n)^2 return(a) } sek=c() for(i in 1:4000){ sek[1]=GCV(0.00) sek[i+1]=GCV(i/1000) } tt2=(0:4000)/1000 mm2=cbind(tt2,sek) m2=mm2[sort.list(mm2[,2]),] k1=m2[1,1];k1 mlab <- expression(k) plab <- "GCV of RLTSE" plot(tt2,sek,xlab= mlab,type="l",ylab = plab,pch="",cex.axis=0.6,cex.lab=1.) title((~italic(k[opt]==0.143 ))) ##############################################