--- title: "IS5 in R: Inferences for Regression (Chapter 20)" author: "Nicholas Horton (nhorton@amherst.edu)" date: "2025-01-20" date-format: iso format: pdf toc: true editor: source --- ```{r} #| label: setup #| include: false library(mosaic) library(tidyverse) ``` ## Introduction and background This document is intended to help describe how to undertake analyses introduced as examples in the Fifth Edition of *Intro Stats* (2018) by De Veaux, Velleman, and Bock. This file as well as the associated Quarto reproducible analysis source file used to create it can be found at http://nhorton.people.amherst.edu/is5. This work leverages initiatives undertaken by Project MOSAIC (http://www.mosaic-web.org), an NSF-funded effort to improve the teaching of statistics, calculus, science and computing in the undergraduate curriculum. In particular, we utilize the `mosaic` package, which was written to simplify the use of R for introductory statistics courses. A short summary of the R needed to teach introductory statistics can be found in the mosaic package vignettes (https://cran.r-project.org/web/packages/mosaic). A paper describing the mosaic approach was published in the *R Journal*: https://journal.r-project.org/archive/2017/RJ-2017-024. We begin by loading packages that will be required for our analyses. ```{r} library(mosaic) library(tidyverse) library(broom) ``` ## Chapter 20: Inferences for Regression ```{r} #| message: false BodyFat <- read_csv("http://nhorton.people.amherst.edu/is5/data/Bodyfat.csv") |> janitor::clean_names() ``` By default, `read_csv()` prints the variable names. These messages have been suppressed using the `message: false` code chunk option to save space and improve readability. Here we use the `clean_names()` function from the `janitor` package to sanitize the names of the columns (which would otherwise contain special characters or whitespace). ```{r} #| message: false # Figure 20.1, page 642 gf_point(pct_bf ~ waist, data = BodyFat) |> gf_smooth() |> # to show linear relationship gf_labs(x = "Waist (in.)", y = "% Body Fat") ``` ### Section 20.1: The Regression Model ```{r} lm(pct_bf ~ waist, data = BodyFat) ``` ```{r} #| echo: false # warning suppression for linewidth argument ``` ```{r} #| warning: false # Figure 20.2 gf_histogram(~ pct_bf, data = BodyFat, binwidth = 2.5, center = 1.25) |> gf_labs(x = "% Body Fat", y = "# of Men") # Figure 20.3 (reinterpreted with points) BodyFat <- BodyFat |> mutate(roundedwaist = cut(waist, breaks = c(0, 30, 35, 40, 45, Inf), labels = c(1:5))) gf_point(pct_bf ~ jitter(as.numeric(roundedwaist)), data = BodyFat) |> gf_lm() |> gf_labs(y = "% Body Fat", x = "Waist") ``` #### Random Matters: Slopes Vary ```{r} num_samp <- 25 # It's too messy to do any more than 25 slopesdata <- do(num_samp) * lm(pct_bf ~ waist, data = resample(BodyFat)) ``` For more information about `resample()`, refer to the resampling vignette: https://cran.r-project.org/web/packages/mosaic/vignettes/Resampling.html ```{r} #| fig.height: 8 #| fig.width: 7 slopesdata <- slopesdata |> mutate( at27 = Intercept + waist * 27, at50 = Intercept + waist * 50, color = as.factor(1:num_samp) ) # Figure 20.4, page 644 gf_point(pct_bf ~ waist, data = BodyFat) |> gf_segment(at27 + at50 ~ 27 + 50, data = slopesdata, color = ~ color) |> gf_labs(color = "Sample", x = "Waist (in.)", y = "Percent Body Fat") ``` ```{r} num_samp <- 1000 # To see the shape of the histogram slopesdata <- do(num_samp) * lm(pct_bf ~ waist, data = resample(BodyFat)) # Figure 20.5 gf_histogram(~ waist, data = slopesdata, binwidth = .05, center = .025) |> gf_labs(x = "Slope", y = "# of Trials") ``` For the histogram, we use 1,000 trials. ### Section 20.2: Assumptions and Conditions ```{r} #| message: false # Figure 20.6 is the same as Figure 20.1 # Figure 20.7 (page 645) bodyfatlm <- lm(pct_bf ~ waist, data = BodyFat) gf_point(resid(bodyfatlm) ~ fitted(bodyfatlm)) |> gf_labs(x = "Predicted", y = "Residuals") |> gf_lm() |> gf_smooth() ``` Here we've added a straight line and a smoother to help see any patterns. ```{r} #| message: false Diamonds <- read_csv("http://nhorton.people.amherst.edu/is5/data/Diamonds.csv") |> janitor::clean_names() ``` Here we fit `price` by `carat_size` for diamonds with the color `E`. ```{r} #| message: false diamondlm <- lm(price ~ carat_size, data = filter(Diamonds, color == "E")) # Figure 20.8, page 646 gf_point(resid(diamondlm) ~ fitted(diamondlm)) |> gf_labs(x = "Predicted Values", y = "Residuals") |> gf_lm() |> gf_smooth() ``` Here we see some curvilinearity in the association between the predicted values and the residuals. ```{r} # Figure 20.9 gf_histogram(~ resid(bodyfatlm), binwidth = 2, center = 1) |> gf_labs(x = "Residuals", y = "Count") ``` Figure 20.10 is intended to convey the same idea as Figure 20.3 (page 643). #### Example 20.1: Checking Assumptions and Conditions Note that points are removed to match the results in the textbook! ```{r} #| message: false Craters <- read_csv("http://nhorton.people.amherst.edu/is5/data/Craters.csv") |> janitor::clean_names() |> filter(log_age <= 1.5) # Removed points to match the textbook ``` ```{r} gf_point(log_diam ~ log_age, data = Craters) |> gf_lm() |> gf_labs(x = "Log(Age)", y = "Log(Diameter)") craterlm <- lm(log_diam ~ log_age, data = Craters) gf_point(resid(craterlm) ~ fitted(craterlm)) |> gf_lm() |> gf_labs(x = "Predicted", y = "Residuals") gf_qq(~ resid(craterlm)) |> gf_qqline(linetype = "solid", color = "red") |> gf_labs(x = "Nscores", y = "Residuals") gf_histogram(~ resid(craterlm), binwidth = .5, center = 0.25) |> gf_labs(x = "Residuals", y = "# of Years") ``` #### Step-By-Step Example: Regression Inference The following scatterplot matches Figure 20.1. ```{r} gf_qq(~ resid(bodyfatlm)) |> gf_qqline(linetype = "solid", color = "red") |> gf_labs(x = "Normal Scores", y = "Residuals (% body fat)") msummary(bodyfatlm) ``` ### Section 20.3: Regression Inference and Intuition See the displays on pages 650 and 651. #### Example 20.2: Confidence Interval and Hypothesis Test for a Slope ```{r} mosaic::msummary(bodyfatlm) mean <- 1.70 se <- .074 tstats <- qt(p = c(.025, .975), df = 248) tstats mean + tstats * se t <- (mean - 0.00) / se t ``` ### Section 20.4: The Regression Table ```{r} # Table 20.1, page 654 mosaic::msummary(bodyfatlm) broom::tidy(bodyfatlm) # an alternative way to display the model ``` ### Section 20.5: Multiple Regression Inference ```{r} # Table 20.2, page 655 bodyfatmlm <- lm(pct_bf ~ waist + height, data = BodyFat) msummary(bodyfatmlm) ``` #### Just Checking ```{r} #| message: false Mouth <- read_csv("http://nhorton.people.amherst.edu/is5/data/Mouth_volume.csv") mouthlm <- lm(Mouth_Volume ~ Height, data = Mouth) # simple linear model df_stats(~ Mouth_Volume, data = Mouth) msummary(mouthlm) mouthmlm <- lm(Mouth_Volume ~ Age + Height, data = Mouth) # multiple linear model msummary(mouthmlm) ``` #### Collinearity ```{r} #| message: false Coasters <- read_csv("http://nhorton.people.amherst.edu/is5/data/Coasters_2015.csv") nrow(Coasters) Coasters <- Coasters |> filter(Name != "Tower of Terror", Name != "Xcelerator") |> # Removed artificially accelerated coasters and Tower of Terror filter(Drop != "NA", Duration != "NA") |> mutate(Inversions = as.factor(Inversions)) nrow(Coasters) coasterlm <- lm(Duration ~ Drop, data = Coasters) # simple linear model mosaic::msummary(coasterlm) coastermlm <- lm(Duration ~ Drop + Speed, data = Coasters) # multiple linear regression model mosaic::msummary(coastermlm) ``` ```{r} gf_point(Speed ~ Drop, data = Coasters) |> gf_lm() ``` ### Section 20.6: Confidence and Prediction Intervals ```{r} #| echo: false # warning: false added to deal with linewidth ``` ```{r} #| warning: false # Figure 20.16, page 659 gf_point(pct_bf ~ waist, data = BodyFat) |> gf_lm(interval = "confidence", fill = "red") |> gf_lm(interval = "prediction", fill = "navy") |> gf_labs(x = "Waist (in.)", y = "% of Body Fat") ``` ### Section 20.7: Logistic Regression ```{r} #| message: false PimaIndians <- read_csv("http://nhorton.people.amherst.edu/is5/data/Pima_indians.csv") |> filter(BMI != 0) # Figure 20.17, page 661 PimaIndians |> mutate(Diabetes = ifelse(Diabetes == 1, "Diabetic", "Not Diabetic")) |> gf_boxplot(BMI ~ as.factor(Diabetes), xlab = "Diabetes (1 = yes)") # Figure 20.21, page 663 gf_point(Diabetes ~ BMI, data = PimaIndians, ylab = "Diabetes (1 = yes)") |> gf_smooth(method = "glm", method.args = list(family = "binomial")) ``` ### Section 20.8: More About Regression