--- title: "IS5 in R: Comparing Groups (Chapter 17)" author: "Nicholas Horton (nhorton@amherst.edu)" date: "2025-01-20" date-format: iso format: pdf toc: true editor: source --- ```{r} #| label: setup #| include: false library(mosaic) library(tidyverse) ``` ## Introduction and background This document is intended to help describe how to undertake analyses introduced as examples in the Fifth Edition of *Intro Stats* (2018) by De Veaux, Velleman, and Bock. This file as well as the associated Quarto reproducible analysis source file used to create it can be found at http://nhorton.people.amherst.edu/is5. This work leverages initiatives undertaken by Project MOSAIC (http://www.mosaic-web.org), an NSF-funded effort to improve the teaching of statistics, calculus, science and computing in the undergraduate curriculum. In particular, we utilize the `mosaic` package, which was written to simplify the use of R for introductory statistics courses. A short summary of the R needed to teach introductory statistics can be found in the mosaic package vignettes (https://cran.r-project.org/web/packages/mosaic). A paper describing the mosaic approach was published in the *R Journal*: https://journal.r-project.org/archive/2017/RJ-2017-024. We begin by loading packages that will be required for our analyses. ```{r} library(mosaic) library(tidyverse) ``` ### Section 17.1: A Confidence Interval for the Difference Between Two Proportions ```{r} # Creating a data frame for Seatbelts Seatbelts <- bind_rows( do(2777) * data.frame(passenger = "F", belted = TRUE), do(4208 - 2777) * data.frame(passenger = "F", belted = FALSE), do(1363) * data.frame(passenger = "M", belted = TRUE), do(2763 - 1363) * data.frame(passenger = "M", belted = FALSE) ) |> select(passenger, belted) ``` The `mosaic::do()` function constructs the correct number of rows for the data frame from provided cell counts. ```{r} #| label: belted #| cache: true set.seed(234) num_sim <- 10000 # What does do() do? abs(diffmean(belted ~ passenger, data = resample(Seatbelts))) # Difference of proportions from one random resample abs(diffmean(belted ~ passenger, data = resample(Seatbelts))) # Difference of proportions from another random resample do(2) * # Calculates two differences abs(diffmean(belted ~ passenger, data = resample(Seatbelts))) # We want to calculate num_sim resampled differences of proportions seatbeltresamples <- do(num_sim) * abs(diffmean(belted ~ passenger, data = resample(Seatbelts))) ``` For more information about `resample()`, refer to the resampling vignette: https://cran.r-project.org/web/packages/mosaic/vignettes/Resampling.html ```{r} # Figure 17.1, page 542 gf_histogram(~ diffmean, data = seatbeltresamples) |> gf_labs(x = "Difference of Proportions", y = "# of Resamples") ``` #### Example 17.1: Finding the Standard Error of a Difference in Proportions We begin with some wrangling to create the dataset. ```{r} # Creating the data set for online profiles OnlineProf <- bind_rows( do(141) * data.frame(gender = "M", profile = TRUE), # 248 * .57 rounds to 141 do(107) * data.frame(gender = "M", profile = FALSE), # 248 - 141 do(179) * data.frame(gender = "F", profile = TRUE), do(77) * data.frame(gender = "F", profile = FALSE) ) tally(~ gender, data = OnlineProf) ``` ```{r} OnlineProfM <- OnlineProf |> filter(gender == "M") # Make a data set for male observations nM <- nrow(OnlineProfM) nM # n for males propMyes <- mean(~profile, data = OnlineProfM) propMyes # p for males sepboys <- ((propMyes * (1 - propMyes)) / nM)^.5 sepboys # SE for males ``` ```{r} OnlineProfF <- OnlineProf |> filter(gender == "F") # Make a data set for female observations nF <- nrow(OnlineProfF) nF # n for females propFyes <- mean(~profile, data = OnlineProfF) propFyes # p for females sepgirls <- ((propFyes * (1 - propFyes)) / nF)^.5 sepgirls # SE for females ``` ```{r} sep <- (sepboys^2 + sepgirls^2)^.5 sep # overall SE ``` #### Example 17.2: Finding a Two-Proportion *z*-Interval We can calculate the desired Z interval. ```{r} zstats <- qnorm(p = c(.025, .975)) (propFyes - propMyes) + zstats * sep # Or, you can use: prop.test(x = c(179, 141), n = c(nF, nM), correct = FALSE) ``` The `prop.test()` function can be used to find confidence intervals and p-values of both one or two proportion z-tests. ### Section 17.2: Assumptions and Conditions for Comparing Proportions ### Section 17.3: The Two-Sample *z*-Test: Testing for the Difference Between Proportions #### Step-By-Step Example: A Two-Proportion *z*-Test Again, we need to create the data table of counts. ```{r} # Create the data set SleepHabits <- bind_rows( do(205) * data.frame(gen = "GenY", internet = TRUE), do(293 - 205) * data.frame(gen = "GenY", internet = FALSE), do(235) * data.frame(gen = "GenX", internet = TRUE), do(469 - 235) * data.frame(gen = "GenX", internet = FALSE) ) ``` ```{r} # Mechanics ngeny <- nrow(filter(SleepHabits, gen == "GenY")) ngeny # n for GenY ygeny <- nrow(filter(SleepHabits, gen == "GenY" & internet == TRUE)) ygeny # y for GenY pgeny <- mean(~ internet, data = filter(SleepHabits, gen == "GenY")) pgeny # proportion for GenY ngenx <- nrow(filter(SleepHabits, gen == "GenX")) ngenx # n for GenX ygenx <- nrow(filter(SleepHabits, gen == "GenX" & internet == TRUE)) ygenx # y for GenX pgenx <- mean(~ internet, data = filter(SleepHabits, gen == "GenX")) pgenx # proportion for GenX sepgen <- ((pgeny * (1 - pgeny)) / ngeny + (pgenx * (1 - pgenx)) / ngenx)^.5 sepgen # overall SE pdiff <- pgeny - pgenx pdiff # difference between proportions z <- (pdiff - 0) / sepgen z 2 * pnorm(q = z, lower.tail = FALSE) ``` ### Section 17.4: A Confidence Interval for the Difference Between Two Means The `t.test()` function can be used to generate a confidence interval for the difference between two means. The `conf.level` option can be used to create different intervals. #### Example 17.7: Finding a Confidence Interval for the Difference in Sample Means We can calculate the confidence interval using summary statistics. ```{r} # page 555 nord <- 27 # n for ordinary bowls nref <- 27 # n for refilling bowls yord <- 8.5 # y for ordinary bowls yref <- 14.7 # y for refilling bowls sord <- 6.1 # standard deviation for ordinary bowls sref <- 8.4 # standard deviation for refilling bowls seys <- 2.0 # overall SE diffy <- yref - yord # difference between y's is 6.2 tstats <- qt(p = c(.025, .975), df = 47.46) tstats me <- tstats * seys me # margin of error diffy + me # confidence interval ``` ### Section 17.5: The Two-Sample *t*-Test: Testing for the Difference Between Two Means #### Step-By-Step Example: A Two-Sample *t*-Test for the Difference Between the Two Means We begin by reading the data. ```{r} #| message: false # page 556 BuyingCam <- read_csv("http://nhorton.people.amherst.edu/is5/data/Buy_from_a_friend.csv") ``` ```{r} #| message: false #| warning: false head(BuyingCam) # before reshaping using `pivot_longer()` BuyingCam <- BuyingCam |> pivot_longer( Friend:Stranger, names_to = "buying_type", values_to = "amount_offered" ) head(BuyingCam) # after reshaping # Model gf_boxplot(amount_offered ~ buying_type, fill = ~ buying_type, data = BuyingCam) |> gf_labs(x = "Buying From", y = "Amount Offered ($)", fill = "") BuyingCam |> filter(buying_type == "Friend") |> gf_histogram(~ amount_offered, binwidth = 25, center = 12) |> gf_labs(x = "Buy from Friend", y = "") BuyingCam |> filter(buying_type == "Stranger") |> gf_histogram(~ amount_offered, binwidth = 50, center = 20) |> gf_labs(x = "Buy from Stranger", y = "") ``` We can replicate the analyses on pages 557-558. ```{r} df_stats(amount_offered ~ buying_type, data = BuyingCam) t.test(amount_offered ~ buying_type, data = BuyingCam) ``` ### Section 17.6: Randomization Tests and Confidence Intervals for Two Means We begin by reading in the Cars dataset. ```{r} #| message: false Cars <- readr::read_csv("http://nhorton.people.amherst.edu/is5/data/Car_speeds.csv") # Figure 17.2 (page 560) is the same as Figure 4.4 (page 102) df_stats(speed ~ direction, data = Cars) ``` ```{r} #| label: Carsims #| cache: true #| warning: false set.seed(23456) num_sim <- 10000 CarSims <- do(num_sim) * diffmean(speed ~ shuffle(direction), data = Cars) # Figure 17.3, page 560 gf_histogram(~ diffmean, data = CarSims, binwidth = 0.1, center = 0.05) |> gf_vline(xintercept = 2.53) |> gf_labs(x = "Differences in Means from 10,000 Trials", y = "# of Trials") ``` ```{r} #| label: Carboots #| cache: true set.seed(32453) num_sim <- 10000 CarBoots <- do(num_sim) * diffmean(speed ~ direction, data = resample(Cars)) qdata(~ diffmean, p = c(.025, .975), data = CarBoots) CarBoots <- CarBoots |> mutate(interval = ifelse( diffmean > 1.88 & diffmean < 3.19, "Within 95% Confidence", # if TRUE "Outside 95% Confidence" # if FALSE )) ``` ```{r} #| warning: false # Figure 17.4 gf_histogram( ~ diffmean, fill = ~ interval, data = CarBoots, binwidth = 0.2, center = 0.1 ) |> gf_vline(xintercept = 1.88) |> gf_vline(xintercept = 3.19) |> gf_labs(x = "Difference in Means", y = "# of Trials") + guides(fill = FALSE) # to remove the legend ``` ### Section 17.7: Pooling The pooled variance t.test can be generated by using the option `var.equal = TRUE`. ```{r} t.test(amount_offered ~ buying_type, var.equal = TRUE, data = BuyingCam) ``` ### Section 17.8: The Standard Deviation of a Difference