# Rproject10_flow_occ_regressions.r
# 1.0 Read in data ----
#       See Problem 10.9.50, 
#       data from:  http://pems.eecs.berkeley.edu
# For each of three lanes, the 
#   flow (number of cars)
#   occupancy (percentage of time a car was over the loop)
#
#     1740  5-minute intervals
#     Lane 1 farthest left lane, lane 2 center, lane 3 farthest right


# Load the flow/occupancy table: comma-separated text with a header row.
# Character columns are kept as strings instead of being turned into factors.
flowocc <- read.table(
  file = "Rice 3e Datasets/ASCII Comma/Chapter 10/flow-occ.txt",
  sep = ",",
  stringsAsFactors = FALSE,
  header = TRUE
)

# Parse the timestamp strings (month/day/year hour:minute:second).
# strptime() returns POSIXlt; Timestamp2 keeps that class for any code that
# relies on it.
# NOTE(review): no tz is given, so the session's local time zone is used --
# confirm that matches the zone the data were recorded in.
Timestamp2 <- strptime(flowocc$Timestamp, "%m/%d/%Y %H:%M:%S")
#plot(Timestamp2, flowocc$Lane.1.Occ)
#plot(flowocc$Lane.1.Occ)

# Store the column as POSIXct rather than POSIXlt: POSIXct is a plain numeric
# vector and is the date-time class intended for data-frame columns.
flowocc$Timestamp <- as.POSIXct(Timestamp2)

# Simple linear regression of lane 3 occupancy on lane 1 occupancy using all
# rows. The model is fitted once and reused for the fitted line and for both
# residual diagnostics below.
lmfit1 <- lm(Lane.3.Occ ~ Lane.1.Occ, data = flowocc)

# Scatter plot of the two occupancies with the fitted line overlaid.
plot(flowocc$Lane.1.Occ, flowocc$Lane.3.Occ)
abline(lmfit1, col = "green")

# Residuals against the predictor, with a zero reference line.
plot(flowocc$Lane.1.Occ, lmfit1$residuals)
abline(h = 0, col = "gray")

# Normal Q-Q plot of the residuals.
qqnorm(lmfit1$residuals)

# Consider two subsets

# Lane 1 occupancy value at which the data are split into two subsets.
occ.cutoff <- 0.18

# Logical row masks for the two subsets. The second mask uses >= so that a
# row whose occupancy equals the cutoff exactly is assigned to the second
# subset instead of being left out of both.
ind.subset1 <- (flowocc$Lane.1.Occ < occ.cutoff)

ind.subset2 <- (flowocc$Lane.1.Occ >= occ.cutoff)

# For first subset:

# Scatter plot restricted to the rows selected by ind.subset1.
plot(flowocc$Lane.1.Occ[ind.subset1], flowocc$Lane.3.Occ[ind.subset1])

# Fit the same regression with 0/1 case weights: rows outside the subset get
# weight 0 and do not contribute to the fit. The argument is spelled out as
# `weights` instead of relying on partial matching of `weight`.
# The residual vector is indexed by the same row mask below, so it is
# expected to have one entry per row of flowocc.
lmfit1.subset1 <- lm(Lane.3.Occ ~ Lane.1.Occ, data = flowocc,
                     weights = 1 * ind.subset1)
abline(lmfit1.subset1, col = "green")

# Residuals of the subset rows against the predictor, with a zero line.
plot(flowocc$Lane.1.Occ[ind.subset1], lmfit1.subset1$residuals[ind.subset1])

abline(h = 0, col = "gray")

# Normal Q-Q plot of the residuals belonging to the subset rows.
qqnorm(lmfit1.subset1$residuals[ind.subset1])

# For second subset:

# Scatter plot restricted to the rows selected by ind.subset2.
plot(flowocc$Lane.1.Occ[ind.subset2], flowocc$Lane.3.Occ[ind.subset2])

# Fit the same regression with 0/1 case weights: rows outside the subset get
# weight 0 and do not contribute to the fit. The argument is spelled out as
# `weights` instead of relying on partial matching of `weight`.
# The residual vector is indexed by the same row mask below, so it is
# expected to have one entry per row of flowocc.
lmfit1.subset2 <- lm(Lane.3.Occ ~ Lane.1.Occ, data = flowocc,
                     weights = 1 * ind.subset2)
abline(lmfit1.subset2, col = "green")

# Residuals of the subset rows against the predictor, with a zero line.
plot(flowocc$Lane.1.Occ[ind.subset2], lmfit1.subset2$residuals[ind.subset2])

abline(h = 0, col = "gray")

# Normal Q-Q plot of the residuals belonging to the subset rows.
qqnorm(lmfit1.subset2$residuals[ind.subset2])

# For second subset: