\documentclass[11pt]{article} \usepackage[margin=1in,bottom=.5in,includehead,includefoot]{geometry} \usepackage{hyperref} \usepackage{language} \usepackage{alltt} \usepackage{fancyhdr} \pagestyle{fancy} \fancyhf{} %% Now begin customising things. See the fancyhdr docs for more info. \chead{} \lhead[\sf \thepage]{\sf \leftmark} \rhead[\sf \leftmark]{\sf \thepage} \lfoot{} \cfoot{Statistical Sleuth in R: Chapter 3} \rfoot{} \newcounter{myenumi} \newcommand{\saveenumi}{\setcounter{myenumi}{\value{enumi}}} \newcommand{\reuseenumi}{\setcounter{enumi}{\value{myenumi}}} \pagestyle{fancy} \def\R{{\sf R}} \def\Rstudio{{\sf RStudio}} \def\RStudio{{\sf RStudio}} \def\term#1{\textbf{#1}} \def\tab#1{{\sf #1}} \usepackage{relsize} \newlength{\tempfmlength} \newsavebox{\fmbox} \newenvironment{fmpage}[1] { \medskip \setlength{\tempfmlength}{#1} \begin{lrbox}{\fmbox} \begin{minipage}{#1} \vspace*{.02\tempfmlength} \hfill \begin{minipage}{.95 \tempfmlength}} {\end{minipage}\hfill \vspace*{.015\tempfmlength} \end{minipage}\end{lrbox}\fbox{\usebox{\fmbox}} \medskip } \newenvironment{boxedText}[1][.98\textwidth]% {% \begin{center} \begin{fmpage}{#1} }% {% \end{fmpage} \end{center} } \newenvironment{boxedTable}[2][tbp]% {% \begin{table}[#1] \refstepcounter{table} \begin{center} \begin{fmpage}{.98\textwidth} \begin{center} \sf \large Box~\expandafter\thetable. #2 \end{center} \medskip }% {% \end{fmpage} \end{center} \end{table} % need to do something about exercises that follow boxedTable } \newcommand{\cran}{\href{http://www.R-project.org/}{CRAN}} \title{The Statistical Sleuth in R: \\ Chapter 3} \author{ Ruobing Zhang \and Kate Aloisio \and Nicholas J. Horton\thanks{Department of Mathematics, Amherst College, nhorton@amherst.edu} } \date{\today} \begin{document} \maketitle \tableofcontents %\parindent=0pt <>= print.pval = function(pval) { threshold = 0.0001 return(ifelse(pval < threshold, paste("p<", sprintf("%.4f", threshold), sep=""), ifelse(pval > 0.1, paste("p=",round(pval, 2), sep=""), paste("p=", round(pval, 3), sep="")))) } @ <>= opts_chunk$set( dev="pdf", fig.path="figures/", fig.height=3, fig.width=4, out.width=".47\\textwidth", fig.keep="high", fig.show="hold", fig.align="center", prompt=TRUE, # show the prompts; but perhaps we should not do this comment=NA # turn off commenting of ouput (but perhaps we should not do this either ) @ <>= require(Sleuth2) require(mosaic) trellis.par.set(theme=col.mosaic()) # get a better color scheme set.seed(123) # this allows for code formatting inline. Use \Sexpr{'function(x,y)'}, for exmaple. knit_hooks$set(inline = function(x){ if (is.numeric(x)) return(knitr:::format_sci(x, 'latex')) x = as.character(x) h = knitr:::hilight_source(x, 'latex', list(prompt=FALSE, size='normalsize')) h = gsub("([_#$%&])", "\\\\\\1", h) h = gsub('(["\'])', '\\1{}', h) gsub('^\\\\begin\\{alltt\\}\\s*|\\\\end\\{alltt\\}\\s*$', '', h) }) showOriginal=FALSE showNew=TRUE @ \section{Introduction} This document is intended to help describe how to undertake analyses introduced as examples in the Second Edition of the \emph{Statistical Sleuth} (2002) by Fred Ramsey and Dan Schafer. More information about the book can be found at \url{http://www.proaxis.com/~panorama/home.htm}. This file as well as the associated \pkg{knitr} reproducible analysis source file can be found at \url{http://www.amherst.edu/~nhorton/sleuth}. This work leverages initiatives undertaken by Project MOSAIC (\url{http://www.mosaic-web.org}), an NSF-funded effort to improve the teaching of statistics, calculus, science and computing in the undergraduate curriculum. In particular, we utilize the \pkg{mosaic} package, which was written to simplify the use of R for introductory statistics courses. A short summary of the R needed to teach introductory statistics can be found in the mosaic package vignette (\url{http://cran.r-project.org/web/packages/mosaic/vignettes/MinimalR.pdf}). To use a package within R, it must be installed (one time), and loaded (each session). The package can be installed using the following command: <>= install.packages('mosaic') # note the quotation marks @ Once this is installed, it can be loaded by running the command: <>= require(mosaic) @ This needs to be done once per session. In addition the data files for the \emph{Sleuth} case studies can be accessed by installing the \pkg{Sleuth2} package. <>= install.packages('Sleuth2') # note the quotation marks @ <>= require(Sleuth2) @ We also set some options to improve legibility of graphs and output. <>= trellis.par.set(theme=col.mosaic()) # get a better color scheme for lattice options(digits=3, show.signif.stars=FALSE) @ The specific goal of this document is to demonstrate how to calculate the quantities described in \emph{Sleuth} Chapter 3: A Closer Look at Assumptions using R. \section{Cloud Seeding to Increase Rainfall} Does seeding clouds lead to more rainfall? This is the question being addressed by case study 3.1 in the \emph{Sleuth}. \subsection{Summary statistics and graphical displays (untransformed)} We begin by reading the data and summarizing the variables. <<>>= summary(case0301) favstats(Rainfall ~ Treatment, data=case0301) @ A total of \Sexpr{nrow(case0301)} subjects were included in this data: \Sexpr{nrow(subset(case0301, Treatment=="Seeded"))} seeded days and \Sexpr{nrow(subset(case0301, Treatment=="Unseeded"))} unseeded days (Display 3.1, page 57). <>= bwplot(Rainfall ~ Treatment, data=case0301) @ <>= densityplot(~Rainfall, groups=Treatment, auto.key=TRUE, data=case0301) @ According to the boxplot and the density plot, the rainfall from seeded days seems to be larger than unseeded days. Both density curves are highly skewed to the right. \subsection{Summary statistics and graphical display (transformed)} The skewness suggests there is a need to apply the logarithmic transformation. The transformed data is shown on page 71 (Display 3.9). <<>>= case0301 = transform(case0301, lograin=log(Rainfall)) favstats(lograin ~ Treatment, data=case0301) @ <>= bwplot(lograin ~ Treatment, data=case0301) @ <>= densityplot(~lograin, groups=Treatment, auto.key=TRUE, data=case0301) @ The log transformation reduces skewness of these two distributions. \subsection{Inferential procedures (two-sample t-test)} <<>>= t.test(Rainfall ~ Treatment, var.equal=FALSE, data=case0301) t.test(Rainfall ~ Treatment, var.equal=TRUE, data=case0301) @ The following corresponds to the calculations on page 71. <<>>= summary(lm(lograin ~ Treatment, data=case0301)) ttestlog = t.test(lograin ~ Treatment, data=case0301); ttestlog @ The two-sided $p$-value is $p=0.014$ and the 95\% confidence interval is between \Sexpr{round(ttestlog$conf.int[1], 2)} and \Sexpr{round(ttestlog$conf.int[2], 2)}. \subsection{Interpretation of log model} The following code is used to calculate the ``Summary of Statistical Findings" on page 57. First, we want to calculate the multiplier. <<>>= obslogdiff = -diff(mean(lograin ~ Treatment, data=case0301)); obslogdiff multiplier = exp(obslogdiff); multiplier @ Next we can calculate the 95\% confidence interval for the multiplier. <<>>= ttestlog$conf.int exp(ttestlog$conf.int) @ \section{Effects of Agent Orange on Troops in Vietnam} Is dioxin concentration related to veteran status? This is the question being addressed by case study 3.2 in the \emph{Sleuth}. \subsection{Summary statistics and graphical display} We begin by reading the data and summarizing the variables. <<>>= summary(case0302) favstats(Dioxin ~ Veteran, data=case0302) @ A total of \Sexpr{nrow(case0302)} veterans were included in this data: \Sexpr{nrow(subset(case0302, Veteran=="Vietnam"))} served in Vietnam during 1967 and 1968 and \Sexpr{nrow(subset(case0302, Veteran=="Other"))} served in US or Germany during 1965 and 1971. <>= bwplot(Veteran ~ Dioxin, data=case0302) @ <>= densityplot(~Dioxin, groups=Veteran, auto.key=TRUE, data=case0302) @ Both distributions are highly skewed to the right. \subsection{Inferential procedures (two-sample t-test)} The following code is used to calculate the ``Summary of Statistical Findings" on page 60. <<>>= t.test(Dioxin ~ Veteran, var.equal=TRUE, alternative="less", data=case0302) t.test(Dioxin ~ Veteran, var.equal=TRUE, data=case0302)$conf.int @ So the one-sided $p$-value from a two-sample $t$-test is \Sexpr{pval(t.test(Dioxin ~ Veteran, var.equal=TRUE, alternative="less", data=case0302))}. The 95\% confidence interval is (\Sexpr{round(t.test(Dioxin ~ Veteran, var.equal=TRUE, data=case0302)$conf.int[1], 2)}, \Sexpr{round(t.test(Dioxin ~ Veteran, var.equal=TRUE, data=case0302)$conf.int[2], 2)}). \subsection{Removing outliers} We will remove two extreme observations from the data. First we remove observation 646 and perform a $t$-test (Display 3.7, page 67). <<>>= case0302.2 = case0302[-c(646), ] t.test(Dioxin ~ Veteran, alternative="less", data=case0302.2) @ Next we remove observations 645 and 646 and perform a $t$-test. <<>>= dim(case0302) case0302.3 = case0302[-c(645, 646), ] dim(case0302.3) t.test(Dioxin ~ Veteran, alternative="less", data=case0302.3) @ Notice that after removing these outliers, the $p$-value and the confidence interval have changed but the substantive conclusion is unchanged. \end{document}