# Rproject3_script1_multinomial_simulation.r
#
# Parametric Bootstrap simulation of sampling distributions for alternate estimators

# Multinomial Counts:  Hardy Weinberg Equilibrium

# 1.1 Trinomial data----
#     from Example 8.5.1.A, p. 273 of Rice.
#
# x=(X1,X2,X3)  # counts of multinomial cells (1,2,3)

# Erythrocyte antigen blood types of n=1029 Hong Kong population in 1937.
x<-c(342,500,187)

# 1.2 Two estimators:  Multinomial MLE and Binomial(X3) MLE ----

x.n=sum(x)
x.theta.mle=(2*x[3] + x[2])/(2*sum(x))
x.theta.binomialx3=sqrt(x[3]/sum(x))

print(x.theta.mle)
## [1] 0.4246842
print(x.theta.binomialx3)
## [1] 0.4262978
# 2.0 Simulate sampling distribution of estimators ----

#   1000 trials (the sampling distribution) of the two estimators

# For each trial, generate  sample  of size n=1027 from the multinomial distribution
#     w.samplespace=c(1,2,3); the outcomes/cells of the multinomial 
#     prob=fcn.probs.hardyweinberg(x.theta.mle); the cell probilities
#   
#   Compute estimates of the Hardy-Weinberg theta parameter 
#     by Multinomial MLE and Binomial(X3) MLE

# 2.1 R functions for the simulation ----

# function computing cell probabilities given Hardy-Weinberg theta parameter
fcn.probs.hardyweinberg<-function(theta0){
  probs=c((1-theta0)**2, 2*theta0*(1-theta0), theta0**2)
  return(probs)
}

# function computing cell counts given w.sample, a sample of single-outcome multinomial
# random variables (comparable to Bernoulli outcomes underlying a binomial)
fcn.w.sample.counts<-function(w.sample, w.samplespace){
  result=0*w.samplespace
  for (j.outcome in c(1:length(w.samplespace))){
    result[j.outcome]<-sum(w.sample==w.samplespace[j.outcome])
    }
  return(result)
}

# 2.2 Validate functions for single trial example ----

args(sample)
## function (x, size, replace = FALSE, prob = NULL) 
## NULL
w.samplespace=c(1,2,3)

w.sample= sample(x=w.samplespace,size=x.n,replace=TRUE, 
        prob=fcn.probs.hardyweinberg(x.theta.mle))
w.sample.counts<-fcn.w.sample.counts(w.sample, w.samplespace)

par(mfcol=c(1,1))
plot(w.sample,main=paste("Single Multinomial Trial (n=", as.character(x.n),")",sep=""))

print(table(w.sample))
## w.sample
##   1   2   3 
## 351 489 189
print(w.sample.counts)
## [1] 351 489 189
# 2.3 Conduct Simulation ----

n.simulations=20000

data.simulations<-matrix(NA,nrow=n.simulations, ncol=2)
dimnames(data.simulations)[[2]]<-c("MLE","Alternate")
for (j.simulation in c(1:n.simulations)){
  
  j.w.sample= sample(x=w.samplespace,size=x.n,replace=TRUE, 
                   prob=fcn.probs.hardyweinberg(x.theta.mle))
  j.w.sample.counts<-fcn.w.sample.counts(j.w.sample, w.samplespace)
  
  x.j=j.w.sample.counts
  # print(x.j)
  
x.j.theta.mle=(2*x.j[3] + x.j[2])/(2*sum(x.j))
x.j.theta.binomialx3=sqrt(x.j[3]/sum(x.j))
  data.simulations[j.simulation,1]=x.j.theta.mle
  data.simulations[j.simulation,2]=x.j.theta.binomialx3
}

# 3.  Simulation Results ----


# 3.1 Histogram of estimators sampling distributions -----
par(mfcol=c(2,1))
print(simulations.means<-apply(data.simulations,2,mean))
##       MLE Alternate 
## 0.4248393 0.4246122
print(simulation.stdevs<-sqrt(apply(data.simulations,2,var)))
##        MLE  Alternate 
## 0.01083837 0.01402613
hist(data.simulations[,1], main="Multinomial MLE")
abline(v=simulations.means[1] +c(-1,0,1)*simulation.stdevs[1], col=c(3,3,3))

hist(data.simulations[,2], main="Binomial(X3) MLE")
abline(v=simulations.means[2] +c(-1,0,1)*simulation.stdevs[2], col=c(2,2,2))

# 3.2 Histograms of estimation errors for each estimator ----
data.simulations.error=data.simulations-x.theta.mle

par(mfcol=c(2,1))
hist(data.simulations.error[,1] , main="Error of Multinomial MLE")
print(simulations.error.means<-apply(data.simulations.error,2,mean))
##           MLE     Alternate 
##  1.551749e-04 -7.192265e-05
print(simulations.error.stdevs<-sqrt(apply(data.simulations.error,2,var)))
##        MLE  Alternate 
## 0.01083837 0.01402613
abline(v=simulations.error.means[1] +c(-1,0,1)*simulations.error.stdevs[1], col=c(3,3,3))


hist(data.simulations.error[,2], main="Error of Binomial(X3) MLE")
abline(v=simulations.error.means[2] +c(-1,0,1)*simulations.error.stdevs[2], col=c(2,2,2))

# 3.3 Boxplot comparing sampling distributions ----
par(mfcol=c(1,1))
boxplot(data.simulations, 
        main="Simulated Sampling Distribution\nHardy-Weinberg Parameter")

# 3.4 Scatterplot of joint distribution of estimates ----
plot(data.simulations[,1], data.simulations[,2],xlab="Multinomial MLE", ylab="Binomial(X3) MLE")
abline(v=simulations.means[1] +c(-1,0,1)*simulation.stdevs[1], col=c(3,3,3))
abline(h=simulations.means[2] +c(-1,0,1)*simulation.stdevs[2], col=c(2,2,2))

# 4. Bootstrap Confidence Interval ----

head(data.simulations.error)
##                MLE    Alternate
## [1,] -0.0004859086 -0.004124118
## [2,]  0.0126336249  0.022746688
## [3,] -0.0068027211 -0.008771333
## [4,]  0.0034013605  0.002751979
## [5,]  0.0048590865 -0.007604675
## [6,]  0.0072886297  0.013972683
quantile(data.simulations.error[,1],probs=c(.05,.95))
##          5%         95% 
## -0.01797862  0.01797862
quantile(data.simulations.error[,2],probs=c(.05,.95))
##          5%         95% 
## -0.02303547  0.02274669
x.theta.mle
## [1] 0.4246842
x.theta.binomialx3
## [1] 0.4262978
# 4.1 Approximate 90% confidence interval based on MLE ----

approx.CI.limits.90percent.mle=x.theta.mle +c(- quantile(data.simulations.error[,1], probs=.95),
                                              - quantile(data.simulations.error[,1],probs=.05))
approx.CI.limits.90percent.mle
##       95%        5% 
## 0.4067055 0.4426628
# 4.2 Approximate 90% confidence interval based on BINOMIALX3 ----

approx.CI.limits.90percent.binomialx3=x.theta.binomialx3 +c(- quantile(data.simulations.error[,2], probs=.95),
                                                            - quantile(data.simulations.error[,2],probs=.05))
approx.CI.limits.90percent.binomialx3
##       95%        5% 
## 0.4035511 0.4493333