# Rproject11_Tablets_TwoSampleT.r
# 1.0 Read in data ----
#       See Example 12.2A of Rice
# 
#   Measurements of chlorpheniramine maleate in tablets made by seven laboratories
#     Nominal dosage equal to 4mg
#     10 measurements per laboratory
#
#     Sources of variability
#         within labs
#         between labs


# Note: read.table has trouble parsing header row
if (FALSE){tablets1=read.table(file="Rice 3e Datasets/ASCII Comma/Chapter 12/tablets1.txt",
                   sep=",",stringsAsFactors = FALSE, quote="\'",
                   header=TRUE)
}
# Read in matrix and label columns
tablets1=read.table(file="Rice 3e Datasets/ASCII Comma/Chapter 12/tablets1.txt",
                    sep=",",stringsAsFactors = FALSE, skip=1,
                    header=FALSE)
dimnames(tablets1)[[2]]<-paste("Lab",c(1:7),sep="")

tablets1
##    Lab1 Lab2 Lab3 Lab4 Lab5 Lab6 Lab7
## 1  4.13 3.86 4.00 3.88 4.02 4.02 4.00
## 2  4.07 3.85 4.02 3.88 3.95 3.86 4.02
## 3  4.04 4.08 4.01 3.91 4.02 3.96 4.03
## 4  4.07 4.11 4.01 3.95 3.89 3.97 4.04
## 5  4.05 4.08 4.04 3.92 3.91 4.00 4.10
## 6  4.04 4.01 3.99 3.97 4.01 3.82 3.81
## 7  4.02 4.02 4.03 3.92 3.89 3.98 3.91
## 8  4.06 4.04 3.97 3.90 3.89 3.99 3.96
## 9  4.10 3.97 3.98 3.97 3.99 4.02 4.05
## 10 4.04 3.95 3.98 3.90 4.00 3.93 4.06
#   Replicate Figure 12.1 of Rice
boxplot(tablets1)

#   2. Comparing Two Independent Samples ----
boxplot(tablets1[,c("Lab4","Lab7")])

x=tablets1$Lab4
y=tablets1$Lab7

#   2.1 Define a function to implement Two-sample t test ---- 
fcn.TwoSampleTTest<-function(x,y, conf.level=0.95, digits0=4){
  # conf.level=0.95; digits0=4
x.mean=mean(x)
y.mean=mean(y)
x.var=var(x)
y.var=var(y)
x.n=length(x)
y.n=length(y)
sigmasq.pooled=((x.n-1)*x.var + (y.n-1)*y.var)/(x.n + y.n -2)
# Print out statistics for each sample and pooled estimate of standard deviation
cat("Sample Statistics:\n")
print(t(data.frame(
  x.mean,
  x.n,
  x.stdev=sqrt(x.var),
  y.mean,
  y.n,
  y.stdev=sqrt(y.var),
  sigma.pooled=sqrt(sigmasq.pooled)
  )))

cat("\n t test Computations")
mean.diff=x.mean-y.mean
mean.diff.sterr=sqrt(sigmasq.pooled)*sqrt( 1/x.n + 1/y.n)
mean.diff.tstat=mean.diff/mean.diff.sterr
tstat.df=x.n + y.n -2
mean.diff.tstat.pvalue=2*pt(-abs(mean.diff.tstat), df=tstat.df)

print(t(data.frame(
  mean.diff=mean.diff,
  mean.diff.sterr=mean.diff.sterr,
  tstat=mean.diff.tstat,
  df=tstat.df,
  pvalue=mean.diff.tstat.pvalue)))

# Confidence interval for difference
alphahalf=(1-conf.level)/2.
t.critical=qt(1-alphahalf, df=tstat.df)

mean.diff.confInterval=mean.diff +c(-1,1)*mean.diff.sterr*t.critical
cat(paste("\n", 100*conf.level, " Percent Confidence Interval:\n "))
cat(paste(c(
  "[", round(mean.diff.confInterval[1],digits=digits0), 
  ",", round(mean.diff.confInterval[2],digits=digits0),"]"), collapse=""))

#    invisible(return(NULL))
}

#  2.2 Apply function fcn.TwoSampleTTest ----


# Compare Lab7 to Lab4
fcn.TwoSampleTTest(tablets1$Lab7, tablets1$Lab4)
## Sample Statistics:
##                     [,1]
## x.mean        3.99800000
## x.n          10.00000000
## x.stdev       0.08482662
## y.mean        3.92000000
## y.n          10.00000000
## y.stdev       0.03333333
## sigma.pooled  0.06444636
## 
##  t test Computations                       [,1]
## mean.diff        0.07800000
## mean.diff.sterr  0.02882129
## tstat            2.70633286
## df              18.00000000
## pvalue           0.01445587
## 
##  95  Percent Confidence Interval:
##  [0.0174,0.1386]
#

#  2.3 Compare to built-in r function t.test() ----
t.test(tablets1$Lab7, tablets1$Lab4, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  tablets1$Lab7 and tablets1$Lab4
## t = 2.7063, df = 18, p-value = 0.01446
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.01744872 0.13855128
## sample estimates:
## mean of x mean of y 
##     3.998     3.920
#  2.4 Compare to Linear Regression ----

x=tablets1$Lab4
y=tablets1$Lab7

#   Create yvec and  xmat matrix
yvec=c(x,y)
xmat.col1=0*yvec +1
xmat.col2=c(0*x, (1 + 0*y))
xmat=cbind(xmat.col1, xmat.col2)

plot(xmat.col2, yvec)
#   2.4.1 Fit linear regression
lmfit=lm(yvec ~ xmat.col2,x=TRUE, y=TRUE)
abline(lmfit, col='green')

# x matrix used by lm() is same as xmat
#   print(abs(xmat-lmfit$x))

print(summary(lmfit))
## 
## Call:
## lm(formula = yvec ~ xmat.col2, x = TRUE, y = TRUE)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.1880 -0.0245  0.0010  0.0440  0.1020 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.92000    0.02038 192.348   <2e-16 ***
## xmat.col2    0.07800    0.02882   2.706   0.0145 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06445 on 18 degrees of freedom
## Multiple R-squared:  0.2892, Adjusted R-squared:  0.2497 
## F-statistic: 7.324 on 1 and 18 DF,  p-value: 0.01446
# t value / P-value for xmat.col2 estimate same as two-sample t test

# F-statistic in lm()  equal to square of t value
# (p-values of F and t are same)