# Rproject11_Tablets_TwoSampleT.r
# 1.0 Read in data ----
#       See Example 12.2A of Rice
# 
#   Measurements of chlorpheniramine maleate in tablets made by seven laboratories
#     Nominal dosage equal to 4 mg
#     10 measurements per laboratory
#
#     Sources of variability
#         within labs
#         between labs


# Note: read.table has trouble parsing the header row of this file, so the
#   call below is left disabled inside if (FALSE)
if (FALSE){tablets1=read.table(file="Rice 3e Datasets/ASCII Comma/Chapter 12/tablets1.txt",
                   sep=",",stringsAsFactors = FALSE, quote="\'",
                   header=TRUE)
}
# Read the data in without the header row and label the columns
tablets1=read.table(file="Rice 3e Datasets/ASCII Comma/Chapter 12/tablets1.txt",
                    sep=",",stringsAsFactors = FALSE, skip=1,
                    header=FALSE)
colnames(tablets1) <- paste0("Lab", 1:7)

tablets1
##    Lab1 Lab2 Lab3 Lab4 Lab5 Lab6 Lab7
## 1  4.13 3.86 4.00 3.88 4.02 4.02 4.00
## 2  4.07 3.85 4.02 3.88 3.95 3.86 4.02
## 3  4.04 4.08 4.01 3.91 4.02 3.96 4.03
## 4  4.07 4.11 4.01 3.95 3.89 3.97 4.04
## 5  4.05 4.08 4.04 3.92 3.91 4.00 4.10
## 6  4.04 4.01 3.99 3.97 4.01 3.82 3.81
## 7  4.02 4.02 4.03 3.92 3.89 3.98 3.91
## 8  4.06 4.04 3.97 3.90 3.89 3.99 3.96
## 9  4.10 3.97 3.98 3.97 3.99 4.02 4.05
## 10 4.04 3.95 3.98 3.90 4.00 3.93 4.06
#   Replicate Figure 12.1 of Rice
boxplot(tablets1)
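
# Optional check (not in Rice's script): per-lab means and standard deviations,
#   to put numbers on the within-lab and between-lab variability seen in the boxplot
sapply(tablets1, mean)
sapply(tablets1, sd)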

#   2.1 Define a function to implement the two-sample t test ----
fcn.TwoSampleTTest <- function(x, y, conf.level=0.95, digits0=4){
  # x, y: numeric sample vectors
  # conf.level: confidence level for the interval on the mean difference
  # digits0: rounding used when printing that interval
  # (defaults for interactive testing: conf.level=0.95; digits0=4)
  x.mean=mean(x)
  y.mean=mean(y)
  x.var=var(x)
  y.var=var(y)
  x.n=length(x)
  y.n=length(y)
  sigmasq.pooled=((x.n-1)*x.var + (y.n-1)*y.var)/(x.n + y.n - 2)
  # Print out statistics for each sample and the pooled estimate of the standard deviation
  cat("Sample Statistics:\n")
  print(t(data.frame(
    x.mean,
    x.n,
    x.stdev=sqrt(x.var),
    y.mean,
    y.n,
    y.stdev=sqrt(y.var),
    sigma.pooled=sqrt(sigmasq.pooled)
  )))

  cat("\n t test Computations\n")
  mean.diff=x.mean-y.mean
  mean.diff.sterr=sqrt(sigmasq.pooled)*sqrt(1/x.n + 1/y.n)
  mean.diff.tstat=mean.diff/mean.diff.sterr
  tstat.df=x.n + y.n - 2
  mean.diff.tstat.pvalue=2*pt(-abs(mean.diff.tstat), df=tstat.df)

  print(t(data.frame(
    mean.diff=mean.diff,
    mean.diff.sterr=mean.diff.sterr,
    tstat=mean.diff.tstat,
    df=tstat.df,
    pvalue=mean.diff.tstat.pvalue)))

  # Confidence interval for the difference in means
  alphahalf=(1-conf.level)/2
  t.critical=qt(1-alphahalf, df=tstat.df)

  mean.diff.confInterval=mean.diff + c(-1,1)*mean.diff.sterr*t.critical
  cat(paste("\n", 100*conf.level, " Percent Confidence Interval:\n "))
  cat(paste(c(
    "[", round(mean.diff.confInterval[1], digits=digits0),
    ",", round(mean.diff.confInterval[2], digits=digits0), "]"), collapse=""))

  #    invisible(return(NULL))
}

#  2.2 Apply function fcn.TwoSampleTTest ----


# Compare Lab7 to Lab4
fcn.TwoSampleTTest(tablets1$Lab7, tablets1$Lab4)
## Sample Statistics:
##                     [,1]
## x.mean        3.99800000
## x.n          10.00000000
## x.stdev       0.08482662
## y.mean        3.92000000
## y.n          10.00000000
## y.stdev       0.03333333
## sigma.pooled  0.06444636
## 
##  t test Computations
##                        [,1]
## mean.diff        0.07800000
## mean.diff.sterr  0.02882129
## tstat            2.70633286
## df              18.00000000
## pvalue           0.01445587
## 
##  95  Percent Confidence Interval:
##  [0.0174,0.1386]
# Compare output to t.test
t.test(tablets1$Lab7, tablets1$Lab4, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  tablets1$Lab7 and tablets1$Lab4
## t = 2.7063, df = 18, p-value = 0.01446
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.01744872 0.13855128
## sample estimates:
## mean of x mean of y 
##     3.998     3.920
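
# For reference (not part of the original analysis): the default t.test() uses
#   Welch's approximation and does not assume equal variances; the sample standard
#   deviations above (0.085 for Lab7 vs 0.033 for Lab4) suggest that assumption is
#   worth questioning here
t.test(tablets1$Lab7, tablets1$Lab4)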
#   3.1 Conduct One-Way ANOVA ----
#       Analysis of Variance with R function aov()

#   Create vector variables
#     yvec = Dependent variable 
yvec=as.vector(data.matrix(tablets1))

#     xvec.factor.Lab = Independent variable (of factor type in R)

xvec.factor.Lab=as.factor(as.vector(col(data.matrix(tablets1))))
table(xvec.factor.Lab)
## xvec.factor.Lab
##  1  2  3  4  5  6  7 
## 10 10 10 10 10 10 10
plot(as.numeric(xvec.factor.Lab), yvec)
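
# Alternative construction (a sketch, not used below): stack() reshapes the data
#   frame to long format in one step; 'values' plays the role of yvec and 'ind' is
#   the lab factor, with levels Lab1..Lab7 instead of 1..7 (tablets1.long is a new
#   object introduced only for this illustration)
tablets1.long <- stack(tablets1)
str(tablets1.long)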

# One-way ANOVA using R function aov()
tablets1.aov<-aov(yvec ~ xvec.factor.Lab)
tablets1.aov.summary<-summary(tablets1.aov)

# Replicate the ANOVA table in Example 12.2A, p. 483 of Rice
print(tablets1.aov.summary)
##                 Df Sum Sq  Mean Sq F value   Pr(>F)    
## xvec.factor.Lab  6 0.1247 0.020790    5.66 9.45e-05 ***
## Residuals       63 0.2314 0.003673                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
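# Optional hand check of the decomposition (a sketch, not Rice's code): the
#   between-lab and within-lab sums of squares can be computed directly and
#   compared with the Sum Sq column above; group.mean is each observation's
#   lab mean, obtained with ave()
grand.mean <- mean(yvec)
group.mean <- ave(yvec, xvec.factor.Lab)        # lab mean aligned with each observation
SS.between <- sum((group.mean - grand.mean)^2)  # between-lab sum of squares
SS.within  <- sum((yvec - group.mean)^2)        # within-lab (residual) sum of squares
c(SS.between = SS.between, SS.within = SS.within)
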
# Construct model tables

tablets1.aov.model.tables<-model.tables(tablets1.aov, type="means",se=TRUE)
print(tablets1.aov.model.tables)
## Tables of means
## Grand mean
##          
## 3.984571 
## 
##  xvec.factor.Lab 
## xvec.factor.Lab
##     1     2     3     4     5     6     7 
## 4.062 3.997 4.003 3.920 3.957 3.955 3.998 
## 
## Standard errors for differences of means
##         xvec.factor.Lab
##                  0.0271
## replic.              10
# Validate standard error for difference of means
ResidualsMeanSq=0.003673
J=nrow(tablets1)
sqrt(ResidualsMeanSq/J) # standard error of each mean
## [1] 0.01916507
sqrt(ResidualsMeanSq/J)*sqrt(2) # standard error of difference of two means
## [1] 0.02710351
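# The same check without hard-coding the mean square (a sketch): for an aov fit,
#   deviance() returns the residual sum of squares, so dividing by the residual
#   degrees of freedom recovers the residual mean square used above
ResidualsMeanSq.hat <- deviance(tablets1.aov) / df.residual(tablets1.aov)
sqrt(ResidualsMeanSq.hat / J) * sqrt(2)
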
#tablets1.aov.model.tables<-model.tables(tablets1.aov, type="effects",se=TRUE)
#print(tablets1.aov.model.tables)

#   3.2 Confidence intervals for pairwise differences ----
# Create confidence intervals on differences between means
#   Studentized range statistic
#   Tukey's 'Honest Significant Difference' method
# Apply R function TukeyHSD(): 
TukeyHSD(tablets1.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = yvec ~ xvec.factor.Lab)
## 
## $xvec.factor.Lab
##       diff          lwr          upr     p adj
## 2-1 -0.065 -0.147546752  0.017546752 0.2165897
## 3-1 -0.059 -0.141546752  0.023546752 0.3226101
## 4-1 -0.142 -0.224546752 -0.059453248 0.0000396
## 5-1 -0.105 -0.187546752 -0.022453248 0.0045796
## 6-1 -0.107 -0.189546752 -0.024453248 0.0036211
## 7-1 -0.064 -0.146546752  0.018546752 0.2323813
## 3-2  0.006 -0.076546752  0.088546752 0.9999894
## 4-2 -0.077 -0.159546752  0.005546752 0.0830664
## 5-2 -0.040 -0.122546752  0.042546752 0.7578129
## 6-2 -0.042 -0.124546752  0.040546752 0.7140108
## 7-2  0.001 -0.081546752  0.083546752 1.0000000
## 4-3 -0.083 -0.165546752 -0.000453248 0.0478900
## 5-3 -0.046 -0.128546752  0.036546752 0.6204148
## 6-3 -0.048 -0.130546752  0.034546752 0.5720976
## 7-3 -0.005 -0.087546752  0.077546752 0.9999964
## 5-4  0.037 -0.045546752  0.119546752 0.8178759
## 6-4  0.035 -0.047546752  0.117546752 0.8533629
## 7-4  0.078 -0.004546752  0.160546752 0.0760155
## 6-5 -0.002 -0.084546752  0.080546752 1.0000000
## 7-5  0.041 -0.041546752  0.123546752 0.7362355
## 7-6  0.043 -0.039546752  0.125546752 0.6912252
#
# Compare the Tukey HSD interval for the 7-4 comparison above with the
#   unadjusted two-sample t interval for Lab7 vs Lab4
fcn.TwoSampleTTest(tablets1$Lab7,tablets1$Lab4)
## Sample Statistics:
##                     [,1]
## x.mean        3.99800000
## x.n          10.00000000
## x.stdev       0.08482662
## y.mean        3.92000000
## y.n          10.00000000
## y.stdev       0.03333333
## sigma.pooled  0.06444636
## 
##  t test Computations
##                        [,1]
## mean.diff        0.07800000
## mean.diff.sterr  0.02882129
## tstat            2.70633286
## df              18.00000000
## pvalue           0.01445587
## 
##  95  Percent Confidence Interval:
##  [0.0174,0.1386]
# Note: the p-value from the pooled two-sample t test is 0.0145, while the
#   Tukey HSD adjusted p-value for the 7-4 comparison is 0.0760; the HSD
#   adjustment controls the family-wise error rate across all 21 pairwise
#   comparisons, so each individual comparison is judged more conservatively
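
# Optional visual summary (not in Rice's script): the plot method for TukeyHSD
#   objects draws the family-wise confidence intervals for all pairwise differences
plot(TukeyHSD(tablets1.aov))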

# 4. Implement One-Way ANOVA with lm() ----
lmfit.oneway.Lab=lm(yvec ~ xvec.factor.Lab,x=TRUE, y=TRUE)
lmfit.oneway.Lab.summary<-summary(lmfit.oneway.Lab)
print(lmfit.oneway.Lab.summary)
## 
## Call:
## lm(formula = yvec ~ xvec.factor.Lab, x = TRUE, y = TRUE)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.1880 -0.0245  0.0060  0.0410  0.1130 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.06200    0.01917 211.948  < 2e-16 ***
## xvec.factor.Lab2 -0.06500    0.02710  -2.398 0.019449 *  
## xvec.factor.Lab3 -0.05900    0.02710  -2.177 0.033248 *  
## xvec.factor.Lab4 -0.14200    0.02710  -5.239 1.99e-06 ***
## xvec.factor.Lab5 -0.10500    0.02710  -3.874 0.000257 ***
## xvec.factor.Lab6 -0.10700    0.02710  -3.948 0.000201 ***
## xvec.factor.Lab7 -0.06400    0.02710  -2.361 0.021316 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06061 on 63 degrees of freedom
## Multiple R-squared:  0.3503, Adjusted R-squared:  0.2884 
## F-statistic:  5.66 on 6 and 63 DF,  p-value: 9.453e-05
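# Quick check (a sketch, not in the original script): with R's default treatment
#   contrasts the intercept is the Lab1 mean and the remaining coefficients are
#   differences from Lab1, so adding them back should reproduce the per-lab means
#   reported by model.tables() above; lm.coefs is a new helper object
lm.coefs <- coef(lmfit.oneway.Lab)
c(lm.coefs[1], lm.coefs[1] + lm.coefs[-1])
tapply(yvec, xvec.factor.Lab, mean)
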
# Compare the regression output from lm() with the ANOVA table from aov()
print(tablets1.aov.summary)
##                 Df Sum Sq  Mean Sq F value   Pr(>F)    
## xvec.factor.Lab  6 0.1247 0.020790    5.66 9.45e-05 ***
## Residuals       63 0.2314 0.003673                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual standard error equals the square root of the residual mean square (Mean Sq for Residuals)
print(lmfit.oneway.Lab.summary$sigma)
## [1] 0.06060541
print((lmfit.oneway.Lab.summary$sigma)^2)
## [1] 0.003673016
# Compare to Mean Sq for Residuals in tablets1.aov.summary

# The F statistic, degrees of freedom, and p-value are the same in both outputs
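
# Equivalently (a sketch): anova() applied to the lm fit reproduces the aov table
anova(lmfit.oneway.Lab)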