\documentclass{beamer} \usepackage{amssymb} \usepackage{amsthm} \usepackage{amsmath} \usepackage{amsfonts} \include{definition} \usepackage{graphicx,color} \newcommand{\vecc}[1]{\mbox{\boldmath $#1$}} \newcommand{\Real}{\mbox{\rm I\kern-.23em\hbox{R}}} \def\cvx{\buildrel X \over \longrightarrow} \def\B{\hbox{\fr B}} \font\fr=eufm10 \def\cv{\buildrel d \over \longrightarrow} \def\convindist{\buildrel d \over \longrightarrow} \def\vague{\buildrel v \over \longrightarrow} \def\cvp{\buildrel P \over \longrightarrow} \def\convinprob{\buildrel P \over \longrightarrow} \def\eq{\buildrel \triangle \over =} \def\cvas{\buildrel {\rm a.s.} \over \longrightarrow} \def\convas{\buildrel {\rm a.s.} \over \longrightarrow} \def\cd{\buildrel d \over =} \def\eas{\buildrel {\rm a.s.} \over =} \def\inas{\buildrel {\rm a.s.} \over \subseteq } \def\equalindist{\buildrel d \over =} \newcommand{\D}{\displaystyle} \newcommand{\T}{\textstyle} \newcommand{\cb}{\textcolor{blue}} \newcommand{\cg}{\textcolor{green}} \newcommand{\crd}{\textcolor{red}} \mode<presentation> { \usetheme[hideothersubsections, right, width=.65in]{PaloAlto} % or ... %\usecolortheme{sidebartab} \usecolortheme{default} \setbeamercovered{transparent} % or whatever (possibly just delete it) } \usepackage[english]{babel} % or whatever \usepackage[latin1]{inputenc} % or whatever \usepackage{times} \usepackage[T1]{fontenc} % Or whatever. Note that the encoding and the font should match. If T1 % does not look nice, try deleting the line with the fontenc. \usepackage{amsmath,amssymb} \usepackage{pgf,pgfarrows,pgfnodes} %\includeonlyframes{test} \title[STAT 5166] % (optional, use only with long paper titles) {Statistics in Applications I} %\subtitle %{Presentation Subtitle} % (optional) \author % (optional, use only with lots of authors) {Xufeng Niu} % - Use the \inst{?} command only if the authors have different % affiliation. \institute[Florida State University] % (optional, but mostly needed) { Department of Statistics\\ Florida State University } % - Use the \inst command only if there are several affiliations. % - Keep it simple, no one is interested in your street address. \date[Short Occasion] % (optional) {} %\subject{Talks} % This is only inserted into the PDF information catalog. Can be left % out. % If you have a file called "university-logo-filename.xxx", where xxx % is a graphic format that can be processed by latex or pdflatex, % resp., then you can add a logo as follows: % \pgfdeclareimage[height=0.5cm]{university-logo}{university-logo-filename} % \logo{\pgfuseimage{university-logo}} % Delete this, if you do not want the table of contents to pop up at % the beginning of each subsection: %\AtBeginSubsection[] %{ % \begin{frame}<beamer> % \frametitle{Outline} %\tableofcontents[currentsection] % \tableofcontents[currentsection,currentsubsection] %\end{frame} %} % If you wish to uncover everything in a step-wise fashion, uncomment % the following command: %\beamerdefaultoverlayspecification{<+->} \begin{document} %\begin{frame} % \titlepage %\end{frame} \begin{frame} \frametitle{Chapter 3 Outline} \tableofcontents % You might wish to add the option [pausesections] \end{frame} \section{Comparing Two Entities} \subsection[An Industrial Example]{An Industrial Example} \begin{frame} \frametitle{An Industrial Example} \noindent \textcolor{red}{Experiment}: Conducted by a chemical manufacturing plant. Used the standard producing method (A) to make \cb{in sequence} 10 batches of the chemical, followed by 10 batches using a modified method (B). 
\vspace{0.1in} \noindent \textcolor{red}{Observations (Chemical Yields)}:
\begin{itemize}
\item $y_{A1}, y_{A2}, \ldots, y_{A,10},$ \hspace{0.1in} $\bar{y}_A = 84.24;$
\item $y_{B1}, y_{B2}, \ldots, y_{B,10},$ \hspace{0.1in} $\bar{y}_B = 85.54.$
\end{itemize}
\vspace{0.1in} \noindent \textcolor{red}{Question}: Can we claim that Method B is better than Method A based on $\bar{y}_B -\bar{y}_A = 1.30$?
\vspace{0.1in} \noindent \textcolor{red}{Statistical Test}: Assume that $Y_A \sim N(\mu_A, \sigma^2)$ and $Y_B \sim N(\mu_B, \sigma^2)$. Test \cg{$H_0: \mu_B=\mu_A$} vs \cg{$H_a: \mu_B>\mu_A$.}
\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
\noindent \textcolor{red}{Test Strategy I}: Use an external reference data set to construct a reference distribution and test the hypothesis \cb{(210 past batches produced by Method A)}.
\begin{itemize}
\item Advantage: No independence assumption is needed for either the past or the current data set.
\item Disadvantage: A fairly extensive past data set is needed to construct the reference distribution.
\end{itemize}
\vspace{0.1in}
\crd{Test Procedure:}
\begin{itemize}
\item Based on the past data set, calculate the moving averages of 10 consecutive observations (201 moving averages):
\[ \bar{y}_{(1-10)}, \bar{y}_{(2-11)}, \bar{y}_{(3-12)}, \ldots; \]
\item Calculate the differences between the averages of two adjacent groups of 10 consecutive batches:
\[ \bar{y}_{(11-20)} - \bar{y}_{(1-10)}, \bar{y}_{(12-21)} - \bar{y}_{(2-11)}, \ldots \]
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
\cb{191 differences of moving averages:}
\[ \bar{y}_{(11-20)} - \bar{y}_{(1-10)}, \bar{y}_{(12-21)} - \bar{y}_{(2-11)}, \ldots \]
\vspace{0.1in}
Draw the histogram of the 191 differences, which is the reference \crd{(empirical)} distribution.
\vspace{0.1in}
If $H_0: \mu_A=\mu_B$ is true, the difference of means from the current samples, $\bar{y}_B -\bar{y}_A = 1.30$, and the 191 differences are from the same distribution. \crd{If $\bar{y}_B -\bar{y}_A = 1.30$ is a rare event among the 191 differences, the null hypothesis is probably not valid.}
\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
In this example, \crd{9 out of the 191 differences are at least as great as 1.30.} Thus the $p$-value for the test based on the reference distribution is $9/191 =0.047$. So we reject the null hypothesis and conclude that Method B is better than Method A, even though the evidence against the null hypothesis is not strong \crd{(a borderline case)}.
\vspace{0.1in} \noindent \textcolor{red}{Test Strategy II}: Student-$t$ test based on the current samples.
\vspace{0.1in}
\cg{Basic Assumptions:}
\begin{itemize}
\item The two populations (Yields from Methods A and B) are both (approximately) normally distributed.
\item The two samples are random samples: \crd{observations in each sample are iid, and the two samples are independent.}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
\crd{If the two basic assumptions are valid}, we have
\[\bar{Y}_B - \bar{Y}_A \sim N\left(\mu_B - \mu_A, \sigma^2\left(\frac{1}{n_A}+\frac{1}{n_B}\right)\right).\]
\vspace{0.1in}
Under the null hypothesis $H_0: \mu_B=\mu_A$, the statistic
\[ T =\frac{\bar{Y}_B - \bar{Y}_A}{s\sqrt{\frac{1}{n_A}+\frac{1}{n_B}}}\sim t_{n_A+n_B-2},\]
where $s^2$ is the pooled estimate of $\sigma^2$ with the form:
\[ s^2 =\frac{\sum_{i=1}^{n_A}(y_{A_i}-\bar{y}_A)^2 + \sum_{i=1}^{n_B}(y_{B_i}-\bar{y}_B)^2}{n_A+n_B-2}.\]
\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
In this example,
\begin{eqnarray*} s^2 = 10.87, & s=3.30, & s\sqrt{\frac{1}{n_A}+\frac{1}{n_B}} = 1.476, \end{eqnarray*}
\[T_0=1.3/1.476 =0.88.\]
The $p$-value for the test is
\[ P(t_{18} > T_0) = P(t_{18}>0.88) =0.195.\]
\vspace{0.1in}
\crd{Conclusion:} Not enough evidence to reject the null hypothesis (\cb{in contradiction with the test result based on the reference distribution}).
\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
\noindent \textcolor{red}{Test Strategy III}: Non-parametric tests based on the current samples.
\vspace{0.1in}
Non-parametric tests are also called ``distribution-free'' tests: they greatly relax distributional assumptions, such as normality, and are widely used in many fields. But independence of observations is still needed in non-parametric tests. Independence is actually a property of the joint distribution.
\vspace{0.1in}
\cg{Basic Assumption for non-parametric two-sample tests:}
\begin{itemize}
\item The two samples are random samples: \crd{observations in each sample are iid, and the two samples are independent.}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
\noindent \crd{Wilcoxon Rank Sum Test}.
\vspace{0.1in}
Frank Wilcoxon (2 Sept 1892 -- 18 Nov 1965)
\vspace{0.1in}
After a World War I job with the Atlas Powder Company in Michigan, Wilcoxon entered Rutgers in 1920 and completed an MS in chemistry in 1921; he then shifted to Cornell and physical chemistry, and received his PhD in 1924. His first position after graduation was a postdoctoral fellowship at the Boyce Thompson Institute for Plant Research in Yonkers, where he was assigned to investigate the use of copper compounds as fungicides. At the Institute, Wilcoxon, Jack Youden, and biologist F.E. Denny led a group in studying Fisher's newly issued \emph{Statistical Methods for Research Workers} (1st ed., 1925). Wilcoxon and Youden both went on to influential careers in statistics.
\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
Wilcoxon had taught physical chemistry part-time at Brooklyn Polytechnic from 1929 to 1941, when war work ended that dual career. In 1960, after his retirement from industry, he was persuaded by \cb{Ralph Bradley} to rejoin academe by accepting a half-time Distinguished Lectureship in the new Department of Statistics at Florida State in Tallahassee, which Bradley had just founded.
\vspace{0.1in}
Wilcoxon, who with his wife was an enthusiastic bicyclist, shifted to a motorcycle for part of his time at Florida. He was a strict teacher. Perhaps bearing in mind the periods in his life when a mistake would destroy the town he was working in, he insisted that the answer not only had to be theoretically right, but also computationally correct.
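\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
\cg{Carrying out Strategies II and III in R (a minimal sketch):} assume \cb{yA} and \cb{yB} are vectors holding the 10 observed yields for Methods A and B (illustrative object names). The calls are standard, but the reported $p$-values may differ slightly from the hand calculations because the software can use exact or continuity-corrected versions of the tests.
\vspace{0.1in}

\cb{t.test(yB, yA, var.equal=TRUE, alternative="greater")}

\cb{wilcox.test(yB, yA, alternative="greater")}
\vspace{0.1in}

The first call is the pooled two-sample $t$ test of $H_0: \mu_B=\mu_A$ vs $H_a: \mu_B>\mu_A$ (Strategy II); the second is the Wilcoxon rank sum test described on the following slides (Strategy III).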
\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
\noindent \textcolor{red}{Wilcoxon Rank Sum Test}: Assume that the two sampled distributions are continuous.
\begin{itemize}
\item Rank the combined samples from the smallest to the largest. Ties are scored with the average rank;
\item Calculate the rank sum for each sample. In this case, $T_A = 95.5$ and $T_B=114.5$.
\[ T_A + T_B = \frac{(n_A+n_B)(n_A+n_B+1)}{2} =210.\]
\item $H_0$: The two sampled populations have identical distributions; $H_a$: The distribution of $Y_B$ is shifted to the right of the distribution of $Y_A$ (one-sided test).
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An Industrial Example}
Under the null hypothesis, when $n_A \geq 10$ and $n_B\geq 10$, we have
\[ Z= \frac{T_B -n_B(n_A+n_B+1)/2}{\sqrt{n_An_B(n_A+n_B+1)/12}} \sim N(0,1)\]
approximately.
\vspace{0.1in}
For this example, the observed statistic value is $z_0 =0.718$. The $p$-value for this test is
\[p = P(Z>0.718) = 0.236 > 0.05.\]
\vspace{0.1in}
\crd{Conclusion:} Not enough evidence to reject the null hypothesis (\cb{again, in contradiction with the test result based on the reference distribution}).
\end{frame}

\subsection[A Simulation Study]{A Simulation Study}

\begin{frame}
\frametitle{A Simulation Study}
\cb{Independence of samples is essential in hypothesis testing.}
\vspace{0.1in}
Type I and Type II errors in tests of hypotheses:
\begin{itemize}
\item Type I Error: $H_0$ is correct but rejected \crd{(False Positive)},
\[\alpha = P(\mbox{Type I Error});\]
\item Type II Error: $H_0$ is false but accepted \crd{(False Negative)},
\[\beta = P(\mbox{Type II Error});\]
\item Power of a test: the probability that $H_0$ is rejected when it is false,
\[\mbox{Power} = 1-\beta.\]
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A Simulation Study}
\cb{Independence of samples is essential in hypothesis testing.}
\vspace{0.1in}
\cb{Simulation Set-Up:}
\begin{itemize}
\item Four types of distributions: Uniform, Normal, Skewed, Contaminated Normal;
\item Three autocorrelations: $\rho=0,$ $\rho=-0.4$, and $\rho=0.4$;
\item Two ways: without randomization and with randomization.
\end{itemize}
\vspace{0.1in}
\cb{Simulation Steps:}
\begin{itemize}
\item Draw two samples of 10 observations each from the same distribution;
\item Conduct a student-$t$ test and a Wilcoxon test for the difference of the means based on the two samples. $H_0: \mu_1=\mu_2$ vs $H_a: \mu_1 \neq \mu_2$. Reject the null hypothesis at the significance level $\alpha=0.05$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A Simulation Study}
\cb{Repeat the two steps 20,000 times and count how many times the null hypothesis is rejected.}
\vspace{0.1in}
\crd{1). How to generate samples with a given autocorrelation?}
\vspace{0.1in}
Consider the MA(1) model
\[ y_i = \epsilon_i + \theta \epsilon_{i-1}.\]
Assume that $\{\epsilon_i\}$ are iid. The autocorrelation function (ACF) of $\{Y_i\}$ is
\begin{eqnarray*} \rho_0 =1, & \rho_1 = \frac{\theta}{1+\theta^2}, & \rho_k = 0 \hspace{0.1in} \mbox{for} \hspace{0.1in} k>1. \end{eqnarray*}
e.g., $\frac{\theta}{1+\theta^2}=-0.4$ implies $\theta = -0.5$.
\end{frame}

\begin{frame}
\frametitle{A Simulation Study}
\begin{itemize}
\item Get a random sample $\{\epsilon_i, i=1, \ldots, 11\}$ from a given distribution,
\item Generate $\{y_i, i=2, \ldots, 11\}$ based on the model.
\end{itemize}
Example:

\cb{n1<-0}

\cb{for (k in 1:20000)\{}

\cb{x1<-rnorm(11,0,1); \hspace{0.1in} x2<-rnorm(11,0,1)}

\cb{y1<-rep(0,10); \hspace{0.1in} y2<-rep(0,10)}

\cb{for(i in 2:11)\{}

\cb{ y1[i-1]<-x1[i]-0.5*x1[i-1]}

\cb{ y2[i-1]<-x2[i]-0.5*x2[i-1] \}}

\cb{t1<-t.test(y1,y2); \hspace{0.2in} p1<-t1\$p.value}

\cb{if (p1<0.05) n1<-n1+1}

\cb{\}}
\end{frame}

\begin{frame}
\frametitle{A Simulation Study}
\crd{2). How to randomize the samples?}
\begin{itemize}
\item Generate $\{y_{1i}, i=1,\ldots, 10\}$ and $\{y_{2i}, i=1,\ldots, 10\}$;
\item Combine the samples and randomly permute the combined sample:
\vspace{0.2in}

\cb{y<-c(y1,y2)}

\cb{y0<-sample(y)}

\cb{y1<-y0[1:10]; \hspace{0.1in} y2<-y0[11:20]}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A Simulation Study}
\crd{3). Conclusions from the simulation study:}
\begin{itemize}
\item A rejection rate of about 5\% is expected since $\alpha=0.05$ is used as the significance level;
\item Negative autocorrelation ($\rho=-0.4$) made the null hypothesis harder to reject than it should be \cb{(rejection rates are less than 5\%)}, while positive autocorrelation ($\rho=0.4$) made the null hypothesis easier to reject than it should be \cb{(rejection rates are larger than 5\%)};
\item The parametric student-$t$ test performed as well as the non-parametric Wilcoxon Rank Sum test in all cases.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A Simulation Study}
\crd{Why did both the student-$t$ and Wilcoxon Rank Sum tests perform poorly when the data are autocorrelated?}
\vspace{0.1in}
Consider a simple example: a one-sample test about a mean.
\vspace{0.1in}
When $\{X_1, \ldots, X_n\}$ are autocorrelated with $\rho_1=\rho$ and $\rho_k=0$ for $k>1$, we have
\[ \mbox{Var}(\bar{X}) = \frac{n\sigma^2 + 2(n-1)\rho\sigma^2}{n^2}.\]
\vspace{0.1in}
The student-$t$ statistic we used is $T=\frac{\bar{X}}{s/\sqrt{n}}$. When the data are negatively autocorrelated, $\frac{s^2}{n}$ over-estimates the variance of $\bar{X}$ and makes the absolute $t$-value smaller than it should be. Thus it is \cb{harder} to reject the null hypothesis.
\end{frame}

\begin{frame}
\frametitle{A Simulation Study}
Similarly, when the data in a sample are negatively autocorrelated, the observations in the sample usually show an up-down pattern \cb{(one up, next one down)}. The Wilcoxon ranks for the two samples are more ``uniformly mixed'' than they should be. Thus it is \cb{harder} to reject the null hypothesis that the two samples are from the same distribution.
\vspace{0.1in}
On the other hand, when the data in each sample are positively correlated, the values in one sample tend to be larger than the values in the other sample. Therefore one sample gets uniformly higher ranks, making the null hypothesis easier to reject than it should be.
\end{frame}

\subsection[Randomization and Blocking]{Randomization and Blocking}

\begin{frame}
\frametitle{Randomization and Blocking}
\cb{Randomization and blocking are two main principles of experimental design.}
\vspace{0.1in}
\crd{Randomization to generate independent samples}:
\begin{itemize}
\item To guarantee valid inferences based on the sample, such as estimation and testing hypotheses.
\vspace{0.1in}
Example: When $\{X_1, \ldots, X_n\}$ are not independent, the variance of $\bar{X}$ is not $\sigma^2/n$ and $\bar{X}$ and $s^2$ may not be independent; as a consequence, the student-$t$ test is no longer valid.
\item To eliminate possible biases that arise through systematic assignment of treatments to experimental units.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Randomization and Blocking}
Example: Expert beer tasting. 10 experts taste brands A and B and give scores. If Brand A is always tasted before Brand B, possible systematic biases will affect the comparison results.
\vspace{0.1in}
Randomization: for each expert, randomly choose whether A or B is tasted first.
\end{frame}

\begin{frame}
\frametitle{Randomization and Blocking}
\crd{Blocking to increase precision}
\vspace{0.1in}
Blocking in an experiment stratifies experimental units into homogeneous groups. This avoids confounding of effects from different sources and reduces the variance of the experimental errors. Greater precision is usually obtained because differences between blocks are eliminated.
\vspace{0.1in}
Examples of blocking:
\begin{itemize}
\item In a clinical trial, patients are divided into groups based on age, gender, and other physical conditions for a better understanding of the effects of a new drug.
\item In a social study, subjects are stratified into groups based on educational level, country, and race.
\item In a biological or chemical study, samples are analyzed at different times and by different technicians.
\end{itemize}
\end{frame}

\subsection[The Tomato Plant Example]{The Tomato Plant Example}

\begin{frame}
\frametitle{The Tomato Plant Example}
\cb{An experiment conducted by a gardener.}
\begin{itemize}
\item Experimental units: 11 tomato plants in a line.
\item Purpose of the experiment: compare fertilizer mixtures A and B.
\item Randomization of the experiment: treatments (A or B) were assigned to the 11 plants by randomly drawing 11 cards, 5 marked A and 6 marked B.
\item Response: tomato yields in pounds (Table 3.3).
\item Results: $\bar{y}_A = 20.84$, $\bar{y}_B = 22.53$, and $\bar{y}_B-\bar{y}_A =1.69$,
\[ s^2 = \frac{\sum_{i=1}^{n_A}(y_{A_i}-\bar{y}_A)^2 + \sum_{i=1}^{n_B}(y_{B_i}-\bar{y}_B)^2}{n_A+n_B-2} = 39.73.\]
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The Tomato Plant Example}
\begin{tabular}{ll} Test the hypotheses. & $H_0: \mu_A = \mu_B$ \\ & $H_a: \mu_B > \mu_A$ \end{tabular}
\vspace{0.1in}
\cb{1) Student-$t$ test.} Under the null hypothesis,
\[ T = \frac{\bar{Y}_B - \bar{Y}_A}{s\sqrt{\frac{1}{n_A}+\frac{1}{n_B}}}\sim t_{n_A+n_B-2}. \]
Here $n_A=5$, $n_B=6$, and $df=5+6-2 =9$.
\vspace{0.1in}
The observed $T$-value is $T_0 = \frac{1.69}{\sqrt{39.73(1/5+1/6)}}=0.44$ and the $p$-value for the test is $P(t_9>0.44) \approx 0.34$. Hence we do not reject the null hypothesis.
\end{frame}

\begin{frame}
\frametitle{The Tomato Plant Example}
\cb{2) Test based on a randomization distribution.} A randomization distribution is constructed from the values of the difference of means for all possible arrangements of treatments on the experimental units.
\vspace{0.1in}
Under the null hypothesis, the yield of fertilizer $B$ is not different from that of fertilizer $A$, so it should make no difference how we allocate the 5 $A$'s and 6 $B$'s to the 11 plants.
\vspace{0.1in}
\crd{There are $\frac{11!}{5!6!}=462$ possible arrangements of the 5 $A$'s and 6 $B$'s.} In Table 3.3, fix the first row and the third row, and permute the $A$'s and $B$'s in the second row. For each permutation, calculate the difference $\bar{Y}_B - \bar{Y}_A$. \cg{In total, 462 such differences can be calculated.}
\end{frame}

\begin{frame}
\frametitle{The Tomato Plant Example}
The randomization distribution of the difference of means based on the 462 differences is plotted in Figure 3.6. There were 154 differences at least as large as 1.69.
So the $p$-value for the test is $p=154/462 =0.33$. \cb{Again, we do not reject the null hypothesis based on this test.}
\vspace{0.1in}
Sir Ronald Fisher argued that the randomization distribution in this experiment makes it possible to conduct a valid significance test without making any other assumptions about the distribution of the yield. \crd{But it is not always possible to construct the randomization distribution of a statistic. Actually, even for this small experiment, it is not easy to enumerate the 462 distinct arrangements.}
\end{frame}

\begin{frame}
\frametitle{The Tomato Plant Example}
For this example, we may approximate the randomization test in S-Plus or R:

\cb{n1<-0}

\cb{y1<-c(29.2, 11.4, 26.6, 23.7, 25.3, 28.5, 14.2, 17.9, 16.5, 21.1, 24.3)}

\cb{c1<-c(rep("A",5), rep("B",6))}

\cb{for (k in 1:10000)\{}

\cb{c2<-sample(c1)}

\cb{x1<-y1[c2=="A"]; \hspace{0.1in} x2<-y1[c2=="B"]}

\cb{m1<-mean(x1); \hspace{0.1in} m2<-mean(x2)}

\cb{d1<-m2-m1}

\cb{if (d1>=1.69) n1<-n1+1}

\cb{\}}
\end{frame}

\subsection[The Boys' Shoe Example]{The Boys' Shoe Example}

\begin{frame}
\frametitle{The Boys' Shoe Example}
\cb{This is a special case of randomized block designs, with block size 2, also called a paired difference experiment.}
\vspace{0.1in}
\crd{Design of the experiment}:
\begin{itemize}
\item Purpose: Compare two types of materials A and B for making shoe soles.
\item Treatments: materials A and B.
\item Experimental units: 10 boys.
\item Randomization: for each pair of shoes, whether the left or right sole is made with A or B is decided by flipping a coin.
\item Response $y$: degree of wear of the shoes.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The Boys' Shoe Example}
\cb{Observations: $(y_{A1}, y_{B1}), (y_{A2}, y_{B2}), \ldots, (y_{A,10}, y_{B,10})$}
\vspace{0.1in}
The ten pairs are independent. But the two observations in each pair are usually positively correlated.
\vspace{0.1in}
\crd{Other examples:}
\begin{itemize}
\item Selling houses: asking prices and selling prices;
\item Test scores: before and after a training period;
\item Gasoline prices: choose two stations in the same city;
\item Starting salaries: choose a male and a female from the same department with the same GPA.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The Boys' Shoe Example}
\begin{tabular}{ll} Test the hypotheses. & $H_0: \mu_d= \mu_B -\mu_A =0$ \\ & $H_a: \mu_d=\mu_B -\mu_A >0$ \end{tabular}
\vspace{0.1in}
\cb{Working on the differences}
\[d_1=y_{B1}-y_{A1}, \hspace{0.05in} d_2=y_{B2}-y_{A2}, \hspace{0.05in} \ldots, \hspace{0.05in} d_{10}=y_{B,10}-y_{A,10}\]
\cb{instead of the 10 pairs.} The 10 differences are independent! If the original observations are from normal distributions, the 10 differences are also from a normal distribution with mean $\mu_d$ and variance $\sigma^2_d$. The distribution of $\bar{d}$ is $N(\mu_d, \sigma_d^2/n)$.
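\end{frame}

\begin{frame}
\frametitle{The Boys' Shoe Example}
\cg{A minimal R sketch of the paired analysis} (assuming \cb{yA} and \cb{yB} are vectors holding the wear measurements of the A and B soles for the 10 boys, in the same boy order; the object names are illustrative, not from the text):

\cb{d<-yB-yA \hspace{0.3in} \# the 10 within-boy differences}

\cb{t.test(d, alternative="greater") \hspace{0.3in} \# one-sample $t$ test on the differences}

\cb{t.test(yB, yA, paired=TRUE, alternative="greater") \hspace{0.1in} \# equivalent paired form}
\vspace{0.1in}

Both calls compute the statistic $T=\bar{d}/(s_d/\sqrt{n})$ with $df=n-1$, derived on the next slide.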
\end{frame}

\begin{frame}
\frametitle{The Boys' Shoe Example}
Under the null hypothesis, the test statistic
\[T = \frac{\bar{d}}{s_d/\sqrt{n}}\]
has a student-$t$ distribution with $df=n-1$, where $n$ is the number of pairs and $s_d^2$ is the sample variance of the 10 differences.
\vspace{0.1in}
For this example, we have
\begin{itemize}
\item $\bar{d} = 0.41$, \hspace{0.1in} $s_d^2 = \frac{1}{9}\sum_{i=1}^{10} (d_i-\bar{d})^2 = 0.149$;
\item $s_d=0.386$, \hspace{0.1in} $T_0 = \frac{\bar{d}}{s_d/\sqrt{10}} = \frac{0.41}{0.12}=3.4$;
\item The $p$-value for this test is $p=P(t_9>3.4) \approx 0.004$;
\item Conclusion: Reject the null hypothesis using $\alpha=0.05$.
\end{itemize}
\end{frame}

\section{Inference about Variances}
\subsection[One Variance]{One Variance}

\begin{frame}
\frametitle{One Variance}
\cb{1). Inference about the variance of one population, normally distributed data.}
\vspace{0.1in}
\crd{One sample}:
\begin{itemize}
\item $\{y_1, y_2, \ldots, y_n\}$ is a random sample (iid) from $N(\mu, \sigma^2)$;
\item If $\mu$ and $\sigma^2$ are both known,
\[\frac{\sum_{i=1}^n(y_i-\mu)^2}{\sigma^2} \sim \chi^2_n;\]
\item If $\mu$ is unknown (and estimated by $\bar{y}$),
\[\frac{\sum_{i=1}^n(y_i-\bar{y})^2}{\sigma^2} \sim \chi^2_{n-1},\]
\[\frac{(n-1)s^2}{\sigma^2}=\frac{\sum_{i=1}^n(y_i-\bar{y})^2}{\sigma^2} \sim \chi^2_{n-1}.\]
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{One Variance}
\begin{tabular}{ll} Test the hypotheses. & $H_0: \sigma^2 = \sigma_0^2$ ($\sigma^2_0$ is a known value),\\ & $H_a: \sigma^2 \neq \sigma_0^2$ (or $\sigma^2 >\sigma_0^2$ or $\sigma^2 <\sigma_0^2$). \end{tabular}
\vspace{0.1in}
\begin{itemize}
\item Test Statistic: $T=\frac{(n-1)s^2}{\sigma^2_0}$,
\item Under the null hypothesis, $T \sim \chi^2_{n-1}$,
\item Suppose that $s^2>\sigma_0^2$. The $p$-value for the two-sided test is $p = 2\times P(\chi^2_{n-1}>T_0),$ where $T_0$ is the observed value of the statistic.
\item For a given significance level $\alpha>0$, the $100(1-\alpha)\%$ confidence interval for $\sigma^2$ is
\[ \frac{(n-1)s^2}{B} \leq \sigma^2 \leq \frac{(n-1)s^2}{A},\]
where
\[P(\chi^2_{n-1} > B ) = \alpha/2, \hspace{0.1in} P(\chi^2_{n-1} > A ) = 1-\alpha/2.\]
\end{itemize}
\end{frame}

\subsection[Ratio of Variances]{Ratio of Variances}

\begin{frame}
\frametitle{Ratio of Variances}
\cb{2). Inference about the variances of two populations, normally distributed data.}
\vspace{0.1in}
Two samples:
\begin{eqnarray*} x_1, x_2, \ldots, x_{n_1}, & \mbox{iid}, & N(\mu_1, \sigma_1^2), \\ y_1, y_2, \ldots, y_{n_2}, & \mbox{iid}, & N(\mu_2, \sigma_2^2). \end{eqnarray*}
The two samples are also assumed to be independent.
\vspace{0.1in}
\begin{tabular}{ll} Test the hypotheses. & $H_0: \sigma_1^2 = \sigma_2^2$ (or $\frac{\sigma_1^2}{\sigma_2^2}=1$),\\ & $H_a: \sigma_1^2 \neq \sigma_2^2$ (or $\sigma_1^2 >\sigma_2^2$ or $\sigma_1^2 <\sigma_2^2$). \end{tabular}
\vspace{0.1in}
\begin{tabular}{lll} Distributions: & $\frac{v_1 s_1^2}{\sigma_1^2} \sim \chi^2_{v_1}$, & $v_1=n_1-1$,\\ & $\frac{v_2 s_2^2}{\sigma_2^2} \sim \chi^2_{v_2}$, & $v_2=n_2-1$.\\ \end{tabular}
\end{frame}

\begin{frame}
\frametitle{Ratio of Variances}
\cb{$\frac{v_1 s_1^2}{\sigma_1^2}$ and $\frac{v_2 s_2^2}{\sigma_2^2}$ are independent. Thus, $F^\ast = \frac{s_1^2/\sigma_1^2}{s^2_2/\sigma^2_2}$ has an $F$ distribution with $df_1=v_1$ and $df_2=v_2$. But $F^\ast$ is not a statistic.}
\vspace{0.2in}
Under the null hypothesis, the statistic $F=\frac{s_1^2}{s_2^2}$ has an $F$ distribution with $df_1=v_1$ and $df_2=v_2$.
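\vspace{0.1in}
(A minimal R sketch, assuming \cb{x} and \cb{y} hold the two samples: \cb{var.test(x, y)} computes $F=s_1^2/s_2^2$ and an $F$-based $p$-value; a one-sided alternative is requested with the \cb{alternative} argument.)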
\vspace{0.2in}
For inference about two variances, usually a one-sided test is performed. If $s_1^2 > s_2^2$, the $p$-value is $P(F_{v_1, v_2} > F_0)$. If $s_1^2 < s_2^2$, the $p$-value is $P(F_{v_1, v_2} < F_0)$.
\end{frame}

\begin{frame}
\frametitle{Ratio of Variances}
\cb{3). Convert a test on variances to a test on means.}
\vspace{0.1in}
\crd{Distribution Theory:}
\begin{itemize}
\item A random sample $\{X_1, X_2, \ldots, X_{n}\}$, iid, $N(\mu, \sigma^2)$,
\item Sample variance $s^2 \sim \frac{\sigma^2}{n-1}\chi^2_{n-1}$,
\item There exists an orthogonal transformation matrix $A$, $\vec{Y} = A\vec{X}$, such that
\[ \{Y_2, Y_3, \ldots, Y_n\} \hspace{0.1in} \mbox{iid} \hspace{0.1in} N(0, \sigma^2), \hspace{0.2in} \sum_{i=1}^n(X_i-\bar{X})^2 = \sum_{i=2}^nY_i^2.\]
\item Thus
\[s^2 -\sigma^2 = \frac{1}{n-1}\sum_{i=2}^n(Y_i^2-\sigma^2) \eq \frac{1}{n-1}\sum_{i=2}^n w_i,\]
where $\{w_i = Y_i^2-\sigma^2, i=2, \ldots, n\}$ are iid with mean 0 and variance $2\sigma^4$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Ratio of Variances}
By the CLT,
\[(*).\hspace{0.1in} \sqrt{n-1}(s^2-\sigma^2) = \frac{1}{\sqrt{n-1}}\sum_{i=2}^n w_i \Rightarrow N(0,2\sigma^4).\]
\vspace{0.1in}
\crd{Theorem (The Delta-Method).} If $\{X_n\}$ is a sequence of random variables for which $\sqrt{n}(X_n-\mu) \Rightarrow N(0, \sigma^2)$, then for any function $f(\cdot)$ for which $f'(\mu)$ exists and $f'(\mu) \neq 0$,
\[ \sqrt{n}[f(X_n)-f(\mu)] \Rightarrow N(0, \sigma^2[f'(\mu)]^2).\]
\vspace{0.1in}
By (*) and the Delta-Method, we have
\[(**). \hspace{0.2in} \sqrt{n-1}(\log(s^2)-\log(\sigma^2)) \Rightarrow N(0,2).\]
\end{frame}

\begin{frame}
\frametitle{Ratio of Variances}
\begin{tabular}{ll} Test the hypotheses. & $H_0: \sigma_1^2 = \sigma_2^2$ (or $\frac{\sigma_1^2}{\sigma_2^2}=1$),\\ & $H_a: \sigma_1^2 \neq \sigma_2^2$ (or $\sigma_1^2 >\sigma_2^2$ or $\sigma_1^2 <\sigma_2^2$). \end{tabular}
\vspace{0.2in}
Suppose that sample variances $\{(s_{11}^2, s_{21}^2), \ldots, (s_{1m}^2, s_{2m}^2)\}$ are available from a paired difference experiment repeated $m$ times. By (**),
\[\log(s_{1i}^2)-\log(\sigma_1^2) \sim AN(0, 2/(n-1)), \hspace{0.1in} i=1, \ldots, m,\]
\[\log(s_{2i}^2)-\log(\sigma_2^2) \sim AN(0, 2/(n-1)), \hspace{0.1in} i=1, \ldots, m.\]
Thus under the null hypothesis, $\{d_i = \log(s_{1i}^2)-\log(s_{2i}^2), i=1,\ldots, m\}$ are iid and approximately normally distributed with mean zero and some variance $\sigma_d^2$.
\vspace{0.1in}
\end{frame}

\begin{frame}
\frametitle{Ratio of Variances}
The statistic $T= \frac{\bar{d}}{s_d/\sqrt{m}}$ has a student-$t$ distribution with $df=m-1$.
\vspace{0.2in}
\crd{The Analysts example on Page 105}
\begin{itemize}
\item In each of the five weeks, five identical samples were analyzed by two Analysts,
\item $\bar{d} = 0.352$, $s_d = 0.226$, and $s_{\bar{d}} = s_d/\sqrt{5} =0.101$,
\item The $t$-value is about 3.5, and the $p$-value is about 0.012 for the one-sided test $H_a: \sigma_1^2>\sigma_2^2$,
\item Conclusion: Analyst 2 is more precise, i.e., with smaller variance in his/her test results.
\end{itemize}
\end{frame}

\section{Inference about Proportions}
\subsection[One-Sample Binomial]{One-Sample Binomial}

\begin{frame}
\frametitle{One-Sample Binomial}
\cb{1). Binomial Distribution}
\vspace{0.1in}
\crd{Set-up}:
\begin{itemize}
\item $n$ independent Bernoulli Trials:
\[X_i = \left\{\begin{array}{ll} 1, & \mbox{If a success in the $i$th trial},\\ 0, & \mbox{If a failure in the $i$th trial}.
\end{array} \right.\hspace{0.1in} i=1, \ldots, n.\] \item $P(X_i=1) =p$, and $P(X_i=0) = 1-p =q.$\ \item $X=\sum_{i=1}^n X_i$ = the number of successes in the $n$ independent trials, \[ P(X=k) = \left(n \atop k\right) p^k(1-p)^{n-k}, \hspace{0.1in} k=0, 1, \ldots, n.\] \item E($X) = np$, and Var$(X) = np(1-p).$ \end{itemize} \end{frame} \begin{frame} \frametitle{One-Sample Binomial} Estimate for proportion $p$: \[ \hat{p} = \frac{X}{n} = \frac{1}{n}\sum_{i=1}^n X_i,\] \begin{itemize} \item E($\hat{p}) = p$, and Var$(\hat{p}) = \frac{p(1-p)}{n}.$ \item If $n$ is large ($0\leq np\pm 3\sqrt{np(1-p)} \leq n$), by the CLT, \[ \hat{p} \sim AN\left(p, \frac{p(1-p)}{n}\right)\] or \[ X =\sum_{i=1}^n X_i \sim AN(np, np(1-p)).\] \end{itemize} \end{frame} \begin{frame} \frametitle{One-Sample Binomial} \begin{tabular}{ll} Test the hypotheses. & $H_0: p= p_0$ \\ & $H_a: p\neq p_0 (p>p_0, \mbox{or} \hspace{0.05in} p<p_0) $ \end{tabular} \vspace{0.1in} For large sample size (normal approximation), test statistic \[Z=\frac{\hat{p}-p_0}{\sqrt{p_0(1-p_0)/n}} \sim AN(0, 1) \] or \[Z=\frac{n\hat{p}-np_0}{\sqrt{np_0(1-p_0)}} \sim AN(0, 1), \] or \[T=Z^2=\frac{(X-np_0)^2}{np_0(1-p_0)} \sim \chi^2_1 \] \end{frame} \begin{frame} \frametitle{One-Sample Binomial} Notice that \begin{eqnarray*} T&=& \frac{(X-np_0)^2}{np_0(1-p_0)} = \frac{(1-p_0)(X-np_0)^2}{np_0(1-p_0)} +\frac{p_0(X-np_0)^2}{np_0(1-p_0)}\\[0.05in] & = &\frac{(X-np_0)^2}{np_0}+\frac{[(n-X)-n(1-p_0)]^2}{n(1-p_0)}\\[0.05in] & = & \sum_{j=1}^2 \frac{(O_j-e_j)^2}{e_j} \sim \chi^2_1, \end{eqnarray*} where \[O_1 = X, \hspace{0.05in} O_2=n-X, \hspace{0.05in} e_1=np_0, \hspace{0.05in}e_2=n(1-p_0).\] \end{frame} \subsection[r-Sample Binomial]{r-Sample Binomial} \begin{frame} \frametitle{r-Sample Binomial} \begin{itemize} \item $X_i \sim Binomial(n_i, p_i), i=1, \ldots, r, n_i\geq 10$, \item $X_1, \ldots, X_r$ are independent. \end{itemize} \vspace{0.1in} \crd{1). Testing the proportion for each population.} \begin{tabular}{ll} & $H_0: p_i= p_{i0}$ \\ & $H_a:$ At least one of the equations is not valid. \end{tabular} \vspace{0.2in} \cb{Observations:} \[(O_{11}, O_{12}), (O_{21}, O_{22}), \ldots, (O_{r1}, O_{r2}),\] where $O_{i2} = n_i - O_{i1}$. \end{frame} \begin{frame} \frametitle{r-Sample Binomial} \cb{Test Statistic:} \begin{eqnarray*} T &=& \sum_{i=1}^r \sum_{j=1}^2 \frac{(O_{ij}-e_{ij})^2}{e_{ij}} \sim \chi^2_r, \hspace{0.05in} \crd{e_{ij} \geq 5}, \end{eqnarray*} where under the null hypothesis $H_0$, \[p_{i1}=p_{i0}, \hspace{0.05in} p_{i2}=1-p_{i0}, \hspace{0.05in} e_{ij} = n_i\times p_{ij}.\] \vspace{0.1in} \cb{Additive Distribution Families ($X$ and $Y$ are independent)}: \begin{itemize} \item $X \sim \chi^2_n$ and $Y \sim \chi^2_m$, then $X+Y\sim \chi^2_{n+m}$. \item $X \sim Pois(\lambda_1)$ and $Y \sim Pois(\lambda_2)$, then $X+Y\sim Pois(\lambda_1+\lambda_2)$. \end{itemize} \end{frame} \begin{frame} \frametitle{r-Sample Binomial} \cb{Additive Distribution Families ($X$ and $Y$ are independent)}: \begin{itemize} \item $X \sim N(\mu_1, \sigma_1^2)$ and $Y \sim N(\mu_2, \sigma_2^2)$, then $X+Y\sim N(\mu_1+\mu_2, \sigma_1^2+\sigma_2^2)$. \item $X \sim Binom(n_1,p)$ and $Y \sim Binom(n_2, p)$, then $X+Y\sim Binom(n_1+n_2, p)$. \end{itemize} \vspace{0.2in} \crd{2). Testing the r-samples are from the same distribution.} \begin{tabular}{ll} & $H_0: p_i \equiv p,\hspace{0.1in} i=1, \ldots, r$, \\ & $H_a:$ At least one of the equations is not valid. 
\end{tabular}
\vspace{0.1in}
Estimate for the common $p$:
\[\hat{p} = \frac{\sum_{i=1}^r O_{i1}}{\sum_{i=1}^r n_i}.\]
\end{frame}

\begin{frame}
\frametitle{r-Sample Binomial}
\cb{Test Statistic:}
\begin{eqnarray*} T &=& \sum_{i=1}^r \sum_{j=1}^2 \frac{(O_{ij}-\hat{e}_{ij})^2}{\hat{e}_{ij}} \sim \chi^2_{r-1}, \hspace{0.05in} \crd{\hat{e}_{ij}\geq 5}, \end{eqnarray*}
where under the null hypothesis $H_0$,
\[\hat{p}_{i1}=\hat{p}, \hspace{0.05in} \hat{p}_{i2}=1-\hat{p}, \hspace{0.05in} \hat{e}_{ij} = n_i\times \hat{p}_{ij}.\]
\begin{itemize}
\item In general, if $q$ parameters are estimated, the $df$ of the Pearson chi-square distribution is reduced by $q$.
\item The Pearson chi-square test is a goodness-of-fit test: the smaller the chi-square value, the better the fit of the model (the null hypothesis of the test).
\item This is a one-sided test. The $p$-value of the test is $P(\chi^2_{df} > \chi^2_0)$, where $\chi^2_0$ is the observed $\chi^2$ value from the samples.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{r-Sample Binomial}
\crd{3). An example (with fabricated data):} It is believed that about 20\% of Florida families have 3 or more children. Three samples were drawn from three cities. The results are in the following table:
\begin{center}
\begin{tabular}{llll}\hline\hline City & Sample Size & Yes & No \\ \hline Tallahassee & 50 & 20 & 30 \\ Miami & 100 & 25 & 75 \\ Jacksonville& 50 & 15 & 35 \\ \hline \end{tabular}
\end{center}
\vspace{0.1in}
\crd{a). Test 1}
\begin{tabular}{ll} & $H_0: p_i \equiv 0.2, \hspace{0.1in} i=1, 2, 3$, \\ & $H_a:$ At least one of the equations is not valid. \end{tabular}
\end{frame}

\begin{frame}
\frametitle{r-Sample Binomial}
\begin{center}
\begin{tabular}{llll}\hline\hline City & Sample Size & Yes ($e_{i1}$) & No ($e_{i2}$) \\ \hline Tallahassee & 50 & 20 (10) & 30 (40) \\ Miami & 100 & 25 (20) & 75 (80) \\ Jacksonville& 50 & 15 (10) & 35 (40) \\ \hline \end{tabular}
\end{center}
\vspace{0.1in}
\[ T = \sum_{i=1}^3\sum_{j=1}^2\frac{(O_{ij}-e_{ij})^2}{e_{ij}} \sim \chi^2_{3}. \]
The observed $\chi^2$-value is $T_0 = 17.18$. The $p$-value of the test is
\[ p = P(\chi^2_3 > 17.18) \approx 0.0005.\]
\cb{Conclusion: Reject the null hypothesis.}
\end{frame}

\begin{frame}
\frametitle{r-Sample Binomial}
\crd{b). Test 2}
\begin{tabular}{ll} & $H_0: p_i \equiv p, \hspace{0.1in} i=1, 2, 3$, \\ & $H_a:$ At least one of the equations is not valid. \end{tabular}
\vspace{0.1in}
\[\hat{p} = \frac{20+25+15}{50+100+50} = \frac{60}{200}=0.3.\]
\vspace{0.1in}
\[ T = \sum_{i=1}^3\sum_{j=1}^2\frac{(O_{ij}-\hat{e}_{ij})^2}{\hat{e}_{ij}} \sim \chi^2_{2}. \]
The observed $\chi^2$-value is $T_0 = 3.57$.
The $p$-value of the test is
\[ p = P(\chi^2_2 > 3.57) \approx 0.17.\]
\cb{Conclusion: Do not reject the null hypothesis.}
\end{frame}

\subsection[One-Sample Multinomial]{One-Sample Multinomial}

\begin{frame}
\frametitle{One-Sample Multinomial}
\crd{The Multinomial Distribution}:
\begin{itemize}
\item $n$ independent and identical experiments (trials) are performed,
\item $k$ possible outcomes from each experiment: $\{A_1, \ldots, A_k\}$,
\item Probability for each outcome:
\[ \pi_i = P(A_i), \hspace{0.1in} \pi_1 + \pi_2 + \cdots + \pi_k =1.\]
\end{itemize}
\[X_i = \mbox{Number of $A_i$'s observed in the $n$ trials}, \hspace{0.05in} i=1,\ldots, k.\]
\cb{Joint probability mass function for $\{X_1, \ldots, X_k\}$}:
\[ P(X_1=O_1, \ldots, X_k=O_k) = \frac{n!}{\prod_{i=1}^kO_i!}\prod_{i=1}^k\pi_i^{O_i}.\]
\end{frame}

\begin{frame}
\frametitle{One-Sample Multinomial}
\crd{It is well known that}
\begin{itemize}
\item $(X_1, \ldots, X_k)$ satisfies $X_1+X_2+\cdots + X_k =n$,
\item E($X_i)=n\pi_i$, \hspace{0.1in} Var($X_i)=n\pi_i(1-\pi_i)$,
\item Cov($X_i, X_j) = -n\pi_i\pi_j$ for $i\neq j$.
\end{itemize}
\vspace{0.1in}
\crd{Test the hypotheses}
\begin{tabular}{ll} & $H_0: \pi_i \equiv \pi_{i0}, \hspace{0.1in} i=1, \ldots, k$, \\ & $H_a:$ At least one of the equations is not valid. \end{tabular}
\vspace{0.1in}
\cb{Test Statistic:}
\begin{eqnarray*} T &=& \sum_{i=1}^k \frac{(O_{i}-e_{i})^2}{e_{i}} \sim \chi^2_{k-1}, \hspace{0.05in} \crd{e_{i}\geq 5}, \end{eqnarray*}
where under the null hypothesis $H_0$, $e_i=n\pi_{i0}$.
\end{frame}

\subsection[r-Sample Multinomial]{r-Sample Multinomial}

\begin{frame}
\frametitle{r-Sample Multinomial}
\crd{$r$ independent multinomial populations:}
\begin{itemize}
\item $r$ independent random vectors $(X_{i1}, X_{i2}, \ldots, X_{ik})$ for $i=1, \ldots, r$,
\item Each population corresponds to a set of probabilities $(\pi_{i1}, \pi_{i2}, \ldots, \pi_{ik})$ with $\sum_{j=1}^k \pi_{ij} =1$,
\item $(X_{i1}, X_{i2}, \ldots, X_{ik})$ satisfies $X_{i1}+\cdots+X_{ik} = n_i$.
\end{itemize}
\vspace{0.2in}
\cb{Joint probability mass function for $\{X_{i1}, \ldots, X_{ik}\}$}:
\[ P(X_{i1}=O_{i1}, \ldots, X_{ik}=O_{ik}) = \frac{n_i!}{\prod_{j=1}^kO_{ij}!}\prod_{j=1}^k\pi_{ij}^{O_{ij}}.\]
\end{frame}

\begin{frame}
\frametitle{r-Sample Multinomial}
\crd{1). Testing the probabilities for each population.}
\begin{tabular}{ll} & $H_0: \pi_{ij}= \pi_{ij}^{(0)}$, \hspace{0.1in} $j=1, \ldots, k$; $i=1, \ldots, r$. \\ & $H_a:$ At least one of the equations is not valid. \end{tabular}
\vspace{0.2in}
\cb{Test Statistic:}
\begin{eqnarray*} T &=& \sum_{i=1}^r\sum_{j=1}^k \frac{(O_{ij}-e_{ij})^2}{e_{ij}} \sim \chi^2_{r(k-1)}, \hspace{0.05in} \crd{e_{ij}\geq 5}, \end{eqnarray*}
where under the null hypothesis $H_0$, $e_{ij}=n_i\pi_{ij}^{(0)}$.
\end{frame}

\begin{frame}
\frametitle{r-Sample Multinomial}
\crd{2). Testing whether the $r$ populations are identical:}
\begin{tabular}{ll} & $H_0: \pi_{1j}= \pi_{2j} = \cdots = \pi_{rj} \equiv \pi_{j}$, \hspace{0.1in} $j=1, \ldots, k$. \\ & $H_a:$ At least one of the equations is not valid.
\end{tabular}
\vspace{0.1in}
The $(k-1)$ unknown proportions $(\pi_1, \ldots, \pi_{k-1})$ ($\pi_k = 1 -\sum_{j=1}^{k-1}\pi_j$) are estimated by
\[\hat{\pi}_j =\frac{O_{1j} + O_{2j} +\cdots + O_{rj}}{n_1+n_2+\cdots+n_r}.\]
\cb{Test Statistic:}
\begin{eqnarray*} T &=& \sum_{i=1}^r\sum_{j=1}^k \frac{(O_{ij}-\hat{e}_{ij})^2}{\hat{e}_{ij}} \sim \chi^2_{(r-1)(k-1)}, \hspace{0.05in} \crd{\hat{e}_{ij}\geq 5}, \end{eqnarray*}
where under the null hypothesis $H_0$, $\hat{e}_{ij}=n_i\hat{\pi}_{j}$.
\end{frame}

\subsection[Contingency Tables]{Contingency Tables}

\begin{frame}
\frametitle{Contingency Tables}
\cb{General Form: Two-Way Classifications}
\begin{itemize}
\item Two variables ($X, Y$), $X$ with $I$ categories and $Y$ with $J$ categories, form a two-way table with $IJ$ cells. Each cell contains a frequency count of outcomes.
\item Quite often, $X$ is an explanatory variable and $Y$ is a response variable. But $X$ and $Y$ can be two response variables.
\end{itemize}
\crd{An example:}

\begin{tabular}{llllll|c}\hline\hline & \multicolumn{5}{c|}{\underline{\hspace{0.4in} Hospital \hspace{0.4in}} } & \\ Result & A & B & C & D & E & Row Total \\ \hline No Improvement & $n_{11}$ & $n_{12}$ & $n_{13}$ & $n_{14}$ & $n_{15}$ & $n_{1\cdot}$ \\ Partial Restoration & $n_{21}$ & $n_{22}$ & $n_{23}$ & $n_{24}$ & $n_{25}$ & $n_{2\cdot}$ \\ Full Restoration & $n_{31}$ & $n_{32}$ & $n_{33}$ & $n_{34}$ & $n_{35}$ & $n_{3\cdot}$ \\ \hline Column Total & $n_{\cdot 1}$& $n_{\cdot 2}$& $n_{\cdot 3}$& $n_{\cdot 4}$& $n_{\cdot 5}$ & $n$ \\ \hline \hline \end{tabular}
\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
\crd{Three Different Sampling Schemes.}
\vspace{0.1in}
\cb{(i). Total sample size $n$ is fixed, but column and row totals are random.}
\begin{itemize}
\item The researcher decides the sample size $n$ first, based on the population variance, type I and type II errors, and other factors.
\item Using the hospital-patient example: in a retrospective study, the researcher randomly selects $n$ patients' records from the records of the five hospitals;
\item in a prospective study, the researcher goes to the five hospitals and collects patients' records over the next several years until $n$ records are obtained.
\item Both row totals and column totals are random.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
\cg{Distribution Theory:} Let $X_{ij}$ be the frequency count in the cell $(i, j)$. The random vector $\{X_{ij}, i=1, \ldots, I, j=1, \ldots, J\}$ has a multinomial distribution with cell probabilities $\{\pi_{ij}, i=1, \ldots, I, j=1, \ldots, J\}$, where $\sum_{i=1}^I\sum_{j=1}^J \pi_{ij} =1$.
\vspace{0.1in}
\crd{1). Testing given probabilities.}
\begin{tabular}{ll} & $H_0: \pi_{ij}= \pi_{ij}^{(0)}$, \hspace{0.1in} $j=1, \ldots, J$; $i=1, \ldots, I$. \\ & $H_a:$ At least one of the equations is not valid. \end{tabular}
\vspace{0.1in}
\cb{Test Statistic:}
\begin{eqnarray*} T &=& \sum_{i=1}^I\sum_{j=1}^J \frac{(O_{ij}-e_{ij})^2}{e_{ij}} \sim \chi^2_{k-1}, \hspace{0.05in} \crd{e_{ij}\geq 5}, \\[-0.05in] \end{eqnarray*}
where $k=IJ$ and under the null hypothesis $H_0$, $e_{ij}=n\pi_{ij}^{(0)}$.
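\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
\cg{These chi-square tests are available directly in R.} A minimal sketch, assuming \cb{tab} is the $I\times J$ matrix of observed counts and \cb{pi0} the matrix of hypothesized cell probabilities (illustrative object names):

\cb{chisq.test(as.vector(tab), p=as.vector(pi0)) \hspace{0.1in} \# given probabilities, $df=IJ-1$}

\cb{chisq.test(tab) \hspace{0.1in} \# test of independence 2) on the next slide, $df=(I-1)(J-1)$}
\vspace{0.1in}

The second call also covers the homogeneity tests under sampling scheme (ii), which use the same statistic.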
\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
\crd{Cell probabilities:}

\begin{tabular}{cllll|c}\hline\hline & \multicolumn{4}{c|}{\underline{\hspace{0.4in}Y\hspace{0.4in}} } & \\ X & 1 & 2 & $\ldots $ & J & Row Total \\ \hline 1 & $\pi_{11}$ & $\pi_{12}$ & $\ldots $ & $\pi_{1J}$ & $\pi_{1\cdot}$ \\ 2 & $\pi_{21}$ & $\pi_{22}$ & $\ldots $ & $\pi_{2J}$ & $\pi_{2\cdot}$ \\ $\vdots$ & $\vdots$ & $\vdots$ & $\ldots$ & $\vdots$ & $\vdots$ \\ I & $\pi_{I1}$ & $\pi_{I2}$ & $\ldots $ & $\pi_{IJ}$ & $\pi_{I\cdot}$ \\ \hline Column Total & $\pi_{\cdot 1}$& $\pi_{\cdot 2}$& $\ldots$& $\pi_{\cdot J}$ & 1\\ \hline \hline \end{tabular}
\vspace{0.2in}

\crd{2). Testing whether the two categorical variables are independent (not associated).}
\begin{tabular}{ll} & $H_0: \pi_{ij}= \pi_{i\cdot}\pi_{\cdot j}$, \hspace{0.1in} $j=1, \ldots, J$; $i=1, \ldots, I$. \\ & $H_a:$ At least one of the equations is not valid. \end{tabular}
\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
The $[(I-1)+(J-1)]$ unknown marginal probabilities
\[\{\pi_{1\cdot}, \ldots, \pi_{(I-1, \cdot)}, \pi_{\cdot 1}, \ldots, \pi_{(\cdot, J-1)} \}\]
are estimated by
\[\hat{\pi}_{i\cdot} = \frac{n_{i\cdot}}{n}, \hspace{0.05in} i=1, \ldots, (I-1); \hspace{0.1in} \hat{\pi}_{\cdot j} = \frac{n_{\cdot j}}{n}, \hspace{0.05in} j=1, \ldots, (J-1).\]
\cb{Test Statistic ($\hat{\pi}_{ij} = \hat{\pi}_{i\cdot}\hat{\pi}_{\cdot j}$):}
\begin{eqnarray*} T &=& \sum_{i=1}^I\sum_{j=1}^J \frac{(O_{ij}-\hat{e}_{ij})^2}{\hat{e}_{ij}} \sim \chi^2_{(I-1)(J-1)}, \hspace{0.05in} \crd{\hat{e}_{ij}\geq 5}, \\[-0.05in] \end{eqnarray*}
where under the null hypothesis $H_0$, $\hat{e}_{ij}=n\hat{\pi}_{ij}$.
\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
\cb{(ii). One set of marginal totals is random, the other set fixed.}
\begin{itemize}
\item The sampler controls the design and specifies the column totals or the row totals.
\item Using the hospital-patient example: in a retrospective study, the researcher randomly selects 100 patients' records from each of the five hospitals;
\item in a prospective study, the researcher divides patients into five groups and sends one group to each of the five hospitals \crd{(maybe infeasible in practice, but fine for biological experiments)}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
Consider the hospital example and suppose that the column totals (the numbers of patients who had the surgery in each hospital) are fixed, i.e., $\{n_{\cdot 1}, n_{\cdot 2}, n_{\cdot 3}, n_{\cdot 4}, n_{\cdot 5}\}$ are fixed.
\vspace{0.2in}

\begin{tabular}{llllll|c}\hline\hline & \multicolumn{5}{c|}{\underline{\hspace{0.4in} Hospital \hspace{0.4in}} } & \\ Result & A & B & C & D & E & Row Total \\ \hline No Improvement & $n_{11}$ & $n_{12}$ & $n_{13}$ & $n_{14}$ & $n_{15}$ & $n_{1\cdot}$ \\ Partial Restoration & $n_{21}$ & $n_{22}$ & $n_{23}$ & $n_{24}$ & $n_{25}$ & $n_{2\cdot}$ \\ Full Restoration & $n_{31}$ & $n_{32}$ & $n_{33}$ & $n_{34}$ & $n_{35}$ & $n_{3\cdot}$ \\ \hline Column Total & $n_{\cdot 1}$& $n_{\cdot 2}$& $n_{\cdot 3}$& $n_{\cdot 4}$& $n_{\cdot 5}$ & $n$ \\ \hline \hline \end{tabular}
\vspace{0.2in}

\cg{Distribution Theory:} Each column forms a multinomial distribution, giving five independent multinomial distributions.
\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
\crd{1). Testing given probabilities.}
\begin{tabular}{ll} & $H_0: \pi_{ij}= \pi_{ij}^{(0)}$, \hspace{0.1in} $j=1, \ldots, 5$; $i=1, \ldots, 3$. \\ & $H_a:$ At least one of the equations is not valid.
\end{tabular}
\vspace{0.2in}
\cb{Test Statistic:}
\begin{eqnarray*} T &=& \sum_{j=1}^5\sum_{i=1}^3 \frac{(O_{ij}-e_{ij})^2}{e_{ij}} \sim \chi^2_{5(3-1)}, \hspace{0.05in} \crd{e_{ij}\geq 5}, \\[-0.05in] \end{eqnarray*}
where under the null hypothesis $H_0$, $e_{ij}=n_{\cdot j}\pi_{ij}^{(0)}$.
\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
\crd{2). Testing whether the five distributions are identical.}
\begin{tabular}{ll} & $H_0: \pi_{i1}= \cdots = \pi_{i5} \equiv\pi_{i}$, \hspace{0.1in} $i=1, 2, 3$. \\ & $H_a:$ At least one of the equations is not valid. \end{tabular}
\vspace{0.1in}
\cg{The unknown probabilities $\pi_1$ and $\pi_2$ need to be estimated.}
\vspace{0.1in}
\cb{Test Statistic:}
\begin{eqnarray*} T &=& \sum_{j=1}^5\sum_{i=1}^3 \frac{(O_{ij}-\hat{e}_{ij})^2}{\hat{e}_{ij}} \sim \chi^2_{(5-1)(3-1)}, \hspace{0.05in} \crd{\hat{e}_{ij}\geq 5}, \\[-0.05in] \end{eqnarray*}
where under the null hypothesis $H_0$, $\hat{\pi}_i =\frac{n_{i\cdot}}{n}$ and $\hat{e}_{ij}=n_{\cdot j}\hat{\pi}_{i}$.
\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
\cb{(iii). Marginal totals and the total sample size $N$ are all random.}
\begin{itemize}
\item Using the hospital-patient example: the sampler goes to the five hospitals over the next two years and collects the records of all patients who have the surgical procedure.
\end{itemize}
\vspace{0.1in}
\cg{Distribution Theory:} The cell frequencies, $\{n_{11}, \ldots, n_{35}\}$, are independent Poisson random variables, i.e.,
\[ n_{ij} \sim Poisson(\lambda_{ij}), \hspace{0.2in} \sum_{i=1}^3\sum_{j=1}^5n_{ij} \sim Poisson\left(\sum_{i=1}^3\sum_{j=1}^5\lambda_{ij}\right).\]
\end{frame}

\begin{frame}
\frametitle{Contingency Tables}
\cg{Distribution Theory:} Given $N=n$, the random vector $\{n_{ij}, i=1,2,3; j=1, \ldots, 5\}$ has a conditional multinomial distribution with the cell probability
\[\pi_{ij} = \frac{\lambda_{ij}}{\sum_{i=1}^3\sum_{j=1}^5\lambda_{ij}}.\]
\vspace{0.1in}
\cb{Exercise.} Suppose that $X\sim Poisson(\lambda_1)$, $Y\sim Poisson(\lambda_2)$, and $X$ and $Y$ are independent. Given $X+Y = n$, $X \sim Binomial\left(n, \frac{\lambda_1}{\lambda_1+\lambda_2}\right)$ and $Y \sim Binomial\left(n, \frac{\lambda_2}{\lambda_1+\lambda_2}\right)$.
\vspace{0.1in}
\cg{Analysis:} Analyze the results based on the conditional distribution (same tests as in case (i)).
\end{frame}

\end{document}