#Simple Regression
sqrt(9)
sin(2)
2/5
c(1,2,4)
mdat <- matrix(c(1,2,3, 11,12,13), nrow = 2, ncol = 3, 
               dimnames = list(c("row1", "row2"),
                               c("C.1", "C.2", "C.3")))
mdat
y=c(70,65,90,95,110,115,120,140,155,150)
x=c(80,100,120,140,160,180,200,220,240,260)
model=lm(y~-1+x)
mean(x)
sum(x^2)
summary(model)

Y=c(1004,1632,852,1506,1372,1270,1269,903,1555,1260,1146,1276,1225,1321)
X1=c(195,255,255,255,225,225,225,195,225,225,225,225,225,230)
X2=c(4,4,4.6,4.6,4.2,4.1,4.6,4.3,4.3,4,4.7,4.3,4.72,4.3)
model2=lm(Y~X1+X2)
summary(model2)
n=14
X=cbind(rep(1,14),X1,X2)
bhat=solve(t(X)%*%X)%*%t(X)%*%Y;bhat
SSR=t(bhat)%*%t(X)%*%Y-n*(mean(Y))^2;SSR
SSE=t(Y)%*%Y-t(bhat)%*%t(X)%*%Y;SSE
SSY=t(Y)%*%Y-n*(mean(Y))^2;SSY
MSE=SSE/(n-3)
C=solve(t(X)%*%X)
Q1=(bhat[2])^2/C[2,2]
F1=Q1/MSE;F1


USairpollution=matrix(c(46,11,24,47,11,31,110,23,65,26,9,17,17,35,56,10,28,14,14,13,30,10,10,16,29,18,9,31,14,69,10,61,94,26,28,12,29,56,29,8,36,47.6,56.8,61.5,55.0,47.1,55.2,50.6,54.0,49.7,51.5,66.2,51.9,49.0,49.9,49.1,68.9,52.3,68.4,54.5,61.0,55.6,61.6,75.5,45.7,43.5,59.4,68.3,59.3,51.5,54.6,70.3,50.4,50.0,57.8,51.0,56.7,51.1,55.9,57.3,56.6,54.0,44,46,368,652,391,35,3344,462,1007,266,641,454,104,1064,412,721,361,136,381,91,291,337,207,569,699,275,204,96,181,1692,213,347,343,197,137,453,379,775,434,125,80,116,244,497,905,463,71,3369,453,751,540,844,515,201,1513,158,1233,746,529,507,132,593,624,335,717,744,448,361,308,347,1950,582,520,179,299,176,716,531,622,757,277,80,8.8,8.9,9.1,9.6,12.4,6.5,10.4,7.1,10.9,8.6,10.9,9.0,11.2,10.1,9.0,10.8,9.7,8.8,10.0,8.2,8.3,9.2,9.0,11.8,10.6,7.9,8.4,10.6,10.9,9.6,6.0,9.4,10.6,7.6,8.7,8.7,9.4,9.5,9.3,12.7,9.0,33.36,7.77,48.34,41.31,36.11,40.75,34.44,39.04,34.99,37.01,35.94,12.95,30.85,30.96,43.37,48.19,38.74,54.47,37.00,48.52,43.11,49.10,59.80,29.07,25.94,46.00,56.77,44.68,30.18,39.93,7.05,36.22,42.75,42.59,15.17,20.66,38.79,35.89,38.89,30.58,40.25,135,58,115,111,166,148,122,132,155,134,78,86,103,129,127,103,121,116,99,100,123,105,128,123,137,119,113,116,98,115,36,147,125,115,89,67,164,105,111,82,114),41)
cities = c("Albany", "Albuquerque","Atlanta","Baltimore","Buffalo","Charleston","Chicago", "Cincinnati","Cleveland","Columbus","Dallas","Denver","DesMoines","Detroit","Hartford","Houston","Indianapolis","Jacksonville","Kansas City","Little Rock","Louisville","Memphis","Miami", "Milwaukee","Minneapolis","Nashville","New Orleans","Norfolk","Omaha","Philadelphia", "Phoenix","Pittsburgh","Providence","Richmond","Salt Lake City","San Francisco","Seattle","St. Louis","Washington","Wichita","Wilmington")
variables = c("SO2","temp","manu","popul","wind","precip","predays")
colnames(USairpollution) = variables
rownames(USairpollution)= cities
d<-
y<-d[,1]

x1<-d[,2]
x2<-d[,3]
x3<-d[,4]
data1=c(y,x1)

m1=lm(y~x1)
summary(m1)
m=lm(y~-1+USairpollution[,c(-1,-7)])
summary(m)
yhat<-fitted(m)
e=y-yhat
plot(e~yhat)
plot(e~x1)
plot(e~x2)
plot(e~x3)
qqnorm(e)
par(mfrow=c(1,4))
plot(m)

plot(x1,y)
abline(lm(y~x1))

yhat<-fitted(m1)
predict(m1)
confint(m1,level=0.95)

#regression diagnostics
lm.influence(m1)
par(mfrow=c(2,2))
plot(m1)
lev = hat(model.matrix(m1))
plot(lev)
data1[lev >0.2]
#hat : a vector containing the diagonal of the hat matrix.
#coefficients: a matrix whose i-th row contains the change in the estimated 
#coefficients which results when the i-th case is dropped from the regression. Note that aliased coefficients are not included in the matrix.
#sigma:a vector whose i-th element contains the estimate of the residual 
#standard deviation obtained when the i-th case is dropped from the regression (The approximations needed for GLMs can result in this being NaN.)
#wt.res: a vector of weighted residuals.


r<-resid(lm(y~x1))
sr=rstandard(m1)
dffits(m1)
cooks.distance(m1)
dfbeta(m1)
covratio(m1)

summary(mm)

####################Transformation in simple regression
library(alr3)
data(salarygov)
attach(salarygov)

#Figure 3.32 on page 96
m1 <- lm(MaxSalary~Score)
par(mfrow=c(2,2))
plot(Score,MaxSalary)
abline(m1,lty=2,col=2)
StanRes1 <- rstandard(m1)
absrtsr1 <- sqrt(abs(StanRes1))
plot(Score,StanRes1,ylab="Standardized Residuals")
plot(Score,absrtsr1,ylab="Square Root(|Standardized Residuals|)")
abline(lsfit(Score,absrtsr1),lty=2,col=2)

#Output from R on page 96
library(MASS)
library(car)
powerTransform(cbind(MaxSalary,Score))
summary(powerTransform(cbind(MaxSalary,Score)))

#Figure 3.33 on page 97
par(mfrow=c(3,2))
plot(density(MaxSalary,bw="SJ",kern="gaussian"),type="l",
main="Gaussian kernel density estimate",xlab="MaxSalary")
rug(MaxSalary)
boxplot(MaxSalary,ylab="MaxSalary")
qqnorm(MaxSalary, ylab = "MaxSalary")
qqline(MaxSalary, lty = 2, col=2)
plot(density(Score,bw="SJ",kern="gaussian"),type="l",
main="Gaussian kernel density estimate",xlab="Score")
rug(Score)
boxplot(Score,ylab="Score")
qqnorm(Score, ylab = "Score")
qqline(Score, lty = 2, col=2)

#Figure 3.34 on page 97
par(mfrow=c(1,1))
plot(sqrt(Score),log(MaxSalary),xlab=expression(sqrt(Score)))
abline(lsfit(sqrt(Score),log(MaxSalary)),lty=2,col=2)

#Figure 3.35 on page 98
par(mfrow=c(3,2))
plot(density(log(MaxSalary),bw="SJ",kern="gaussian"),type="l",
main="Gaussian kernel density estimate",xlab="log(MaxSalary)")
rug(log(MaxSalary))
boxplot(log(MaxSalary),ylab="log(MaxSalary)")
qqnorm(log(MaxSalary), ylab = "log(MaxSalary)")
qqline(log(MaxSalary), lty = 2, col=2)
sj <- bw.SJ(sqrt(Score),lower = 0.05, upper = 100)
plot(density(sqrt(Score),bw=sj,kern="gaussian"),type="l",
main="Gaussian kernel density estimate",xlab=expression(sqrt(Score)))
rug(sqrt(Score))
boxplot(sqrt(Score),ylab=expression(sqrt(Score)))
qqnorm(sqrt(Score), ylab=expression(sqrt(Score)))
qqline(sqrt(Score), lty = 2, col=2)

#Figure 3.36 on page 99
m2 <- lm(log(MaxSalary)~sqrt(Score))
par(mfrow=c(1,2))
StanRes2 <- rstandard(m2)
absrtsr2 <- sqrt(abs(StanRes2))
plot(sqrt(Score),StanRes2,ylab="Standardized Residuals",xlab=expression(sqrt(Score)))
plot(sqrt(Score),absrtsr2,ylab="Square Root(|Standardized Residuals|)",xlab=expression(sqrt(Score)))
abline(lsfit(sqrt(Score),absrtsr2),lty=2,col=2)
summary(m1)
summary(m2)



####################Transformation in multiple regression

magazines <- read.csv("magazines.csv", header=TRUE)
attach(magazines)

#R output on page 177
library(alr3)
summary(powerTransform(cbind(AdRevenue,AdPages,SubRevenue,NewsRevenue)))


#Figure 6.21 on page 178
pairs(AdRevenue~AdPages+SubRevenue+NewsRevenue)
m0=lm(AdRevenue~AdPages+SubRevenue+NewsRevenue)
#Figure 6.22 on page 179
tAdPages<- log(AdPages)
tSubRevenue <- log(SubRevenue)
tNewsRevenue <- log(NewsRevenue)
m1 <- lm(AdRevenue~log(AdPages)+log(SubRevenue)+log(NewsRevenue))
library(alr3)
par(mfrow=c(1,1))
inverse.response.plot(m1,key=TRUE)

#R output on page 179
library(alr3)
summary(tranxy <-powerTransform(cbind(AdRevenue,AdPages,SubRevenue,NewsRevenue))

#Figure 6.23 on page 180
pairs(log(AdRevenue)~log(AdPages)+log(SubRevenue)+log(NewsRevenue))

#Figure 6.24 on page 181
m2 <- lm(log(AdRevenue)~log(AdPages)+log(SubRevenue)+log(NewsRevenue))
summary(m0)
summary(m2)
par(mfrow=c(2,2))
StanRes2 <- rstandard(m2)
plot(log(AdPages),StanRes2,ylab="Standardized Residuals")
plot(log(SubRevenue),StanRes2,ylab="Standardized Residuals")
plot(log(NewsRevenue),StanRes2,ylab="Standardized Residuals")
plot(m2$fitted.values,StanRes2,ylab="Standardized Residuals",xlab="Fitted Values")

##########Variable Selection in Multiple Linear Regression############
###Ex. 1: Bridge Construction

bridge <- read.table("bridge.txt", header=TRUE)
attach(bridge)

library(alr3)
summary(tranxy <- powerTransform(cbind(Time,DArea,CCost,Dwgs,Length,Spans)))
testTransform(tranxy, c(0, 0, 0, 0,0,0))

# fit linear model with transformed response:
coef(tranxy, round=TRUE)
summary(lm(bcPower(Time,tranxy$lam)~bcPower(DArea,tranxy$lam)+bcPower(CCost,tranxy$lam)+bcPower(Dwgs,tranxy$lam)+bcPower(Length,tranxy$lam)+bcPower(Spans,tranxy$lam)))

# save the rounded transformed values, plot them with a separate
# color for each highway type
transformedY <- bcPower(with(bridge,cbind(Time,DArea,CCost,Dwgs,Length,Spans)),
                coef(tranxy, round=TRUE))
pairs(transformedY,col=as.numeric(bridge$Time)) 

#Figure 6.40 page 198
pairs(log(Time)~log(CCost)+log(Dwgs)+log(Length)+log(Spans)+log(DArea),data=bridge,pch="*")

#Figure 6.41 page 199
m0<- lm(log(Time)~log(CCost)+log(Dwgs)+log(Length)+log(Spans)+log(DArea))
summary(m0)
StanRes1 <- rstandard(m0)
par(mfrow=c(2,3))
plot(log(DArea),StanRes1, ylab="Standardized Residuals")
plot(log(CCost),StanRes1, ylab="Standardized Residuals")
plot(log(Dwgs),StanRes1, ylab="Standardized Residuals")
plot(log(Length),StanRes1, ylab="Standardized Residuals")
plot(log(Spans),StanRes1, ylab="Standardized Residuals")
plot(m0$fitted.values,StanRes1, ylab="Standardized Residuals",xlab="Fitted values")

#Figure 6.42 page 199
par(mfrow=c(1,1))
plot(m0$fitted.values,log(Time),xlab="Fitted Values")
abline(lsfit(m0$fitted.values,log(Time)))

#Figure 6.43 page 200
par(mfrow=c(2,2))
plot(m0)
abline(h=-2,lty=2)
abline(h=2,lty=2)
identify(plot(m0))

abline(v=2*6/45,lty=2)

#Regression output on page 200
summary(m0)

#Figure 6.44 page 201
library(alr3)
mmps(m0,layout=c(2,3))

#R output on page 202
logDArea <- log(DArea)
logCCost <- log(CCost)
logDwgs <- log(Dwgs)
logLength <- log(Length)
logSpans <- log(Spans)
y<-log(Time)
X <- cbind(logCCost,logDwgs,logLength,logSpans,logDArea)
c <- cor(X)
round(c,4);kappa(t(X)%*%X)
vif(m0)

#Figure 6.45 on page 202
library(car)
par(mfrow=c(2,3))
avPlot(m0,variable=log(CCost),ask=FALSE,id=TRUE,col.lines=palette()[1],pch="*",lwd=2)
avPlot(m0,variable=log(Dwgs),ask=FALSE,id=FALSE,col.lines=palette()[1],pch="*",lwd=2)
avPlot(m0,variable=log(Length),ask=FALSE,id=FALSE,col.lines=palette()[1],pch="*",lwd=2)
avPlot(m0,variable=log(Spans),ask=FALSE,id=FALSE,col.lines=palette()[1],pch="*",lwd=2)
avPlot(m0,variable=log(DArea),ask=FALSE,id=FALSE,col.lines=palette()[1],pch="*",lwd=2)


summary(lm(y~X))
m1<-lm(log(Time)~log(CCost)+log(Dwgs)+log(Spans));summary(m1)
#R output on page 203
library(car)
vif(m1)
syy=sum(y^2)-n*(mean(y))^2

n=45
X=cbind(log(CCost),log(Dwgs),log(Spans))
lam<-eigen(t(X)%*%X)$values;sqrt(max(lam)/min(lam))

m1 <- lm(log(Time)~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans))
logDArea <- log(DArea)
logCCost <- log(CCost)
logDwgs <- log(Dwgs)
logLength <- log(Length)
logSpans <- log(Spans)
X <- cbind(logDArea,logCCost,logDwgs,logLength,logSpans)
install.packages("leaps")
library(leaps)
b <- regsubsets(as.matrix(X),log(Time))
rs <- summary(b)
par(mfrow=c(1,2))
plot(1:5,rs$adjr2,xlab="Subset Size",ylab="Adjusted R-squared")
library(car)
subsets(b,statistic=c("adjr2"))

#Table 7.1 on page 235
#Calculate adjusted R-squared
rs$adjr2
rs$outmat

om1 <- lm(log(Time)~log(Dwgs))
om2 <- lm(log(Time)~log(Dwgs)+log(Spans))
om3 <- lm(log(Time)~log(Dwgs)+log(Spans)+log(CCost))
om4 <- lm(log(Time)~log(Dwgs)+log(Spans)+log(CCost)+log(DArea))
om5 <- m1
#Subset size=1
n <- length(om1$residuals)
npar <- length(om1$coefficients) +1
#Calculate AIC
extractAIC(om1,k=2)
#Calculate AICc
extractAIC(om1,k=2)+2*npar*(npar+1)/(n-npar-1)
#Calculate BIC
extractAIC(om1,k=log(n))
#Subset size=2
npar <- length(om2$coefficients) +1
#Calculate AIC
extractAIC(om2,k=2)
#Calculate AICc
extractAIC(om2,k=2)+2*npar*(npar+1)/(n-npar-1)
#Calculate BIC
extractAIC(om2,k=log(n))
#Subset size=3
npar <- length(om3$coefficients) +1
#Calculate AIC
extractAIC(om3,k=2)
#Calculate AICc
extractAIC(om3,k=2)+2*npar*(npar+1)/(n-npar-1)
#Calculate BIC
extractAIC(om3,k=log(n))
#Subset size=4
npar <- length(om4$coefficients) +1
#Calculate AIC
extractAIC(om4,k=2)
#Calculate AICc
extractAIC(om4,k=2)+2*npar*(npar+1)/(n-npar-1)
#Calculate BIC
extractAIC(om4,k=log(n))
#Subset size=5
npar <- length(om5$coefficients) +1
#Calculate AIC
extractAIC(om5,k=2)
#Calculate AICc
extractAIC(om5,k=2)+2*npar*(npar+1)/(n-npar-1)
#Calculate BIC
extractAIC(om5,k=log(n))

#Regression output on pages 235 and 236
summary(om2)
summary(om3)

#Output from R on page 237
backAIC <- step(m1,direction="backward", data=bridge)
backBIC <- step(m1,direction="backward", data=bridge, k=log(n))

#Output from R on page 238
mint <- lm(log(Time)~1,data=bridge)
forwardAIC <- step(mint,scope=list(lower=~1, 
upper=~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans)),
direction="forward", data=bridge)
forwardBIC <- step(mint,scope=list(lower=~1, 
upper=~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans)),
direction="forward", data=bridge,k=log(n))
stepwise <- step(mint,scope=list(lower=~1, 
upper=~log(DArea)+log(CCost)+log(Dwgs)+log(Length)+log(Spans)),
direction="both", data=bridge)



fullmodel <- lm(log(Time)~log(DArea)+log(CCost)+log(Dwgs)+log(Length)
+log(Spans), data = bridge)
step(fullmodel, direction = "backward", trace=TRUE )

######Ridge Estimation################
y<-log(Time)
X <- cbind(logCCost,logDwgs,logSpans)
library(mctest)
omcdiag(X,y)
vif(om3)
x=cbind(rep(1,n),log(CCost),log(Dwgs),log(Spans))
C=t(x)%*%x

###GCV=[1/n*sum(y_i-\hat y_i)^2]/[1-tr(H)/n]^2;\hat y=Hy

L=function(k){
L=matrix(ncol=n,nrow=n)
a=x%*%solve(C+k*diag(4))%*%t(x)
return(a)
}
L(0)
yhat=function(k){
a=L(k)%*%y
return(a)
}

GCV1=function(kh){
k=kh[1]
a=1/n*t(y-yhat(k))%*%(y-yhat(k))/(1-sum(diag(L(k)))/n)^2
return(a)
}

khopt=optim(c(.01),GCV1,method="L-BFGS-B",lower=c(0.00),upper=c(18.9))$par
khopt;

bk=function(k){
a=solve(C+k*diag(4))%*%t(x)%*%y
return(a)}
bk(0)




GCV=function(k){
a=1/n*t(y-yhat(k))%*%(y-yhat(k))/(1-sum(diag(L(k)))/n)^2
return(a)
}
GCV(1)
bk(khopt);
syy=sum(y^2)-n*(mean(y))^2
1-GCV(1.04)/syy
1-GCV(khopt)/syy

sek=c()
for(i in 1:700){
sek[1]=GCV(0.00)
sek[i+1]=GCV(i/10000)
}

tt2=(0:700)/10000
mm2=cbind(tt2,sek)
m2=mm2[sort.list(mm2[,2]),]
k1=m2[1,1];k1
m2[1,]

mlab <- expression(k)
plab <- "GCV"
plot(tt2,sek,xlab= mlab,type="l",ylab = plab,pch="",cex.axis=0.6,cex.lab=1.)
title((~italic(k[opt]==0.0279)))



detach(bridge)


##################WAGE DATA SET######################
#Variable names in order from left to right:
	#EDUCATION: Number of years of education.
	#SOUTH: Indicator variable for Southern Region (1=Person lives in South, 0=Person lives elsewhere).
	#SEX: Indicator variable for sex (1=Female, 0=Male).
	#EXPERIENCE: Number of years of work experience.
	#UNION: Indicator variable for union membership (1=Union member, 	0=Not union member).
	#WAGE: Wage (dollars per hour).
	#AGE: Age (years).
	#RACE: Race (1=Other, 2=Hispanic, 3=White).
	#OCCUPATION: Occupational category (1=Management,2=Sales,3=Clerical,4=Service, 5=Professional, 6=Other).
	#SECTOR: Sector (0=Other, 1=Manufacturing, 2=Construction).
	#MARR: Marital Status (0=Unmarried,  1=Married)



data <- read.table("Wage.txt", header=TRUE)
attach(data)
wage
n=534

y<-wage
X<-data[,c(-6)]
cor(X)
library(mctest)
omcdiag(X,log(y))
cbind(log(y),rep(1,534),education,south,sex,experience,union,age,race,
occupation,sector,married)

##################################################


fit1<-lm(log(y)~education+south+sex+experience+union+age+race+occupation+sector+married)
summary(fit1)
anova(fit1)
r=resid(fit1)
sum(r^2)
plot(fit1)
pred0 <- fitted(fit1,X)
syy=sum(log(y)^2)-n*(mean(log(y)))^2
SSE=t(log(y)-pred0)%*%(log(y)-pred0);SSE;1-SSE/syy
x=as.matrix(cbind(rep(1,n),X))
bhat=solve(t(x)%*%x)%*%t(x)%*%log(y)
SSR=t(bhat)%*%t(x)%*%log(y)-n*(mean(log(y)))^2
SSR/syy;syy-SSR
mlab <- "Observations"
plab <- "Standardized Residuals"

plot(rstandard(fit1),ylim=c(-2.5,2.75),xlab=mlab,ylab=plab)
#abline(h=0)
abline(h=c(-2,2),lty=3,col="darkblue",lwd=2)
identify(rstandard(fit1)-.15)

qqnorm(rstandard(fit1),ylab=plab)
qqline(rstandard(fit1))


#-----------------------AVPLOTS

#plots against explanatory variables


m3=lm(log(wage)~education+south+sex+experience+union+age+race+occupation+sector+married)

library(car)

par(mfrow=c(1,3))
avPlot(m3,variable=education,pch="*",lwd=3,col.lines="darkblue")
avPlot(m3,variable=south,pch="*",lwd=3,col.lines="darkblue")
avPlot(m3,variable=sex,pch="*",lwd=3,col.lines="darkblue")

#####LTS#####################

library(robustbase)
mRobust=ltsReg(log(y)~education+south+sex+experience+union+age+race+
occupation+sector+married)


summary(mRobust)
plot(mRobust, which = "rqq")


out=which(mRobust$lts.wt==0);out

b0=mRobust$coef
yhat0=xg%*%b0


#####RLTS#####################
yg=(y[-out])
xg=x[-out,]
Xg=X[-out,]
summary(m0<-lm(log(yg)~as.matrix(Xg)))
syyg=sum(log(yg)^2)-(length(yg))*(mean(log(yg)))^2
syy=sum(log(y)^2)-(length(y))*(mean(log(y)))^2

n1=length(y[-out])

C0=t(xg)%*%xg
bk=function(k){
a=solve(C0+k*diag(11))%*%t(xg)%*%yg
return(a)}
bk(0.1)


#GCV0#####################################################

L=function(k){
L=matrix(ncol=n1,nrow=n1)
a=xg%*%solve(C0+k*diag(11))%*%t(xg)
return(a)
}

yhat=function(k){
a=L(k)%*%log(yg)
return(a)
}

GCV1=function(kh){
k=kh[1]
a=1/n1*t(log(yg)-yhat(k))%*%(log(yg)-yhat(k))/(1-sum(diag(L(k)))/n1)^2
return(a)
}

khopt=optim(c(2.1),GCV1,method="L-BFGS-B",lower=c(0.00),upper=c(8.9))$par
khopt;


GCV=function(k){
a=1/n*t(log(yg)-yhat(k))%*%(log(yg)-yhat(k))/(1-sum(diag(L(k)))/n)^2
return(a)
}

sek=c()
for(i in 1:4000){
sek[1]=GCV(0.00)
sek[i+1]=GCV(i/1000)
}

tt2=(0:4000)/1000
mm2=cbind(tt2,sek)
m2=mm2[sort.list(mm2[,2]),]
k1=m2[1,1];k1

mlab <- expression(k)
plab <- "GCV of RLTSE"
plot(tt2,sek,xlab= mlab,type="l",ylab = plab,pch="",cex.axis=0.6,cex.lab=1.)
title((~italic(k[opt]==0.143 )))





##############################################