clumps = c(0,1,2,3,4,5,6,7,8,9,10,19) squares = c(56,104,80,62,42,27,9,9,5,3,2,1) # Rice austen example p 486-488 ---- sense_and_sensibility = c(147,25,32,94,59,18) emma = c(186,26,39,105,74,10) sanditonI = c(101,11,15,37,28,10) rownames = c('a', 'an', 'this', 'that', 'with', 'without') observeddata = data.frame(sense_and_sensibility,emma,sanditonI, row.names = rownames) print(observeddata) nwordsPerBook = colSums(observeddata) ntotal = sum(observeddata) nullProbs = rowSums(observeddata)/ntotal ss = sum(observeddata$sense_and_sensibility) * nullProbs em = sum(observeddata$emma) * nullProbs sI = sum(observeddata$sanditonI) * nullProbs expected = data.frame(ss,em,sI, row.names=rownames) print(expected) x2 = sum((observeddata -expected)^2/expected) print(x2) #df = (n-1)*(m-1) : # i.e. Totals for each word give MLE: 6 estimates # each book has total words: 3 totals # Totals for each word and totals for each book are the same so # 6+3 - 1 = 8 relations must be satisfied # --> 18 cells - 8 relations = 10 df df = 10 pvalue = 1-pchisq(x2,df) print(pvalue) austen = rowSums(observeddata) imitator = c(83,29,15,22,43,4) combinedData = data.frame(austen, imitator, row.names=rownames) ntotal = sum(combinedData) nullProbs = rowSums(combinedData)/ntotal au = sum(austen) * nullProbs im = sum(imitator) * nullProbs expected = data.frame(au,im,row.names=rownames) print(combinedData) print(expected) x2 = sum((combinedData -expected)^2/expected) print(x2) df = 5 # 2 rows x 6 words: df = 1*5 pvalue = 1-pchisq(x2,df) print(pvalue) # Rice example p. 489 ---- married_once = c(550,681) married_more = c(61,144) rownames = c('college', 'no college') contingency = data.frame(married_once, married_more, row.names=rownames) nmarriedOnce = sum(contingency[,1]) nmarriedMore = sum(contingency[,2]) ntotal = sum(contingency) colProbs = colSums(contingency)/ntotal rowProbs = rowSums(contingency)/ntotal expected = data.frame(rowProbs*colProbs[1]*ntotal, rowProbs*colProbs[2]*ntotal, row.names=rownames) print(contingency) print(expected) x2 = sum((contingency -expected)^2/expected) print(x2) df = 1 # Given the marginal counts as soon as one cell is filled the others are determined pvalue = 1-pchisq(x2,df) print(pvalue) # Rice problem 15 p. 503 ---- rownames = c('males', 'females') maletumors = c(25,10,20,9,17) maletotals = c(100,100,99,100,99) maleNoTumors = maletotals - maletumors femaletumors = c(33,25,32,26,22) femaletotals = c(100,99,99,99,100) femaleNoTumors = femaletotals - femaletumors malenullprob = sum(maletumors)/sum(maletotals) femalenullprob = sum(femaletumors)/sum(femaletotals) maleexpected_tumors = maletotals*malenullprob maleexpected_noTumors = maletotals*(1-malenullprob) femaleexpected_tumors = femaletotals*femalenullprob femaleexpected_noTumors = femaletotals*(1-femalenullprob) malex2 = sum((maletumors -maleexpected_tumors)^2/maleexpected_tumors) + sum((maleNoTumors - maleexpected_noTumors)^2/maleexpected_noTumors) femalex2 = sum((femaletumors -femaleexpected_tumors)^2/femaleexpected_tumors) + sum((femaleNoTumors - femaleexpected_noTumors)^2/femaleexpected_noTumors) print(malex2) print(femalex2) df = 4 # (5-1)(2-1): 10 cells, 5 relations to compute MLE + 2 to compute totals - 1 because grand total is found twice malepvalue = 1-pchisq(malex2,df) print(malepvalue) femalepvalue = 1-pchisq(femalex2,df) print(femalepvalue) # -------??? y = c(78,56,43,53,43,36,42,29) n = c(47,29,29,32,30,22,23,7) print(y+n) nullprob = sum(y)/sum(y+n) expected_y= (y+n)*nullprob expected_n= (y+n)*(1-nullprob) x2 = sum(((expected_y -y)^2)/expected_y) + sum((expected_n-n)^2/expected_n) df = 7 p = 1-pchisq(x2,df) print(p) # rice problem 8 p.501 ---- x = c(165,152,85,67,85) n = c(95,52,52,35,37) nullprob = sum(n)/sum(x) expected = nullprob*x x2 = sum((expected-n)^2/expected) # rice problem 7 p.501 ps = c(8,14,15,3) bio = c(15,19,4,1) other = c(13,15,7,4) data = data.frame(ps,bio,other) ntotal = sum(data) pgrades = rowSums(data)/ntotal majors = colSums(data) pse = pgrades*majors[1] bioe = pgrades*majors[2] oe = pgrades*majors[3] expected = data.frame(pse,bioe,oe) print(data) print(expected) x2 = sum((data-expected)^2/expected) df = 6 p = 1-pchisq(x2,df) print(p) # Slides cured = c(50,30,12) notcured = c(100,80,18) treated = cured + notcured nullprob = sum(cured)/sum(treated) expectedcured = nullprob*treated expected_notcured = treated-expectedcured likRatioStat = 2*sum(cured*log(cured/expectedcured)) + 2*sum(notcured*log(notcured/expected_notcured)) x2 = sum( (cured-expectedcured)^2/expectedcured) + sum( (notcured-expected_notcured)^2/expected_notcured) df = 2 p_likRat = 1-pchisq(likRatioStat,df) p_x2 = 1-pchisq(x2,df) # Sals restaurant ---- observed = c(30,14,34,45,57,20) expected = 200*c(.1,.1,.15,.2,.3,.15) G = 2*sum(observed*log(observed/expected)) x2 = sum( (observed-expected)^2/expected) p = 1 - pchisq(G, 5) # Rice ex. p491 marriage and college ---- observed = c(550,61,681,144) total = sum(observed) p1 = 1231/total p2 = 1 - p1 p3 = 611/total p4 = 1-p3 nullprobs = c(p1*p3, p2*p3, p1*p4, p2*p4) expected = total*nullprobs likRatioStat = 2*sum(observed*log(observed/expected)) x2 = sum( (observed-expected)^2/expected ) df = 1 p = 1 - pchisq(likRatioStat, df)