From 2be915ad48545ec800ac06bf718a8657d62bad8e Mon Sep 17 00:00:00 2001 From: Michael McIsaac Date: Mon, 31 Oct 2022 15:23:04 -0300 Subject: [PATCH 1/6] sample_props_small often has fewer than the requested 25 elements. The call to filter(scientist_work == "Doesn't benefit") is filtering out any replicates where there are no "Doesn't benefit"s in the small sample. As a result any replicates with p_hat=0 are filtered out and are not displayed. This issue is caused by using a small sample size and a true proportion close to 0 (p=.2). I have replaced this filtering code with the following group_by(replicate)%>% summarize(p_hat = mean(scientist_work=="Doesn't benefit")) Fixes #107. --- .../sampling_distributions.Rmd | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/05a_sampling_distributions/sampling_distributions.Rmd b/05a_sampling_distributions/sampling_distributions.Rmd index 8439ae4..f6fa692 100644 --- a/05a_sampling_distributions/sampling_distributions.Rmd +++ b/05a_sampling_distributions/sampling_distributions.Rmd @@ -114,9 +114,7 @@ samp1 %>% ```{r inline-calc, include=FALSE} # For use inline below samp1_p_hat <- samp1 %>% - count(scientist_work) %>% - mutate(p_hat = n /sum(n)) %>% - filter(scientist_work == "Doesn't benefit") %>% + summarize(p_hat = mean(scientist_work=="Doesn't benefit")) %>% pull(p_hat) %>% round(2) ``` @@ -138,15 +136,14 @@ Not surprisingly, every time you take another random sample, you might get a dif It's useful to get a sense of just how much variability you should expect when estimating the population mean this way. The distribution of sample proportions, called the *sampling distribution (of the proportion)*, can help you understand this variability. In this lab, because you have access to the population, you can build up the sampling distribution for the sample proportion by repeating the above steps many times. 
-Here, we use R to take 15,000 different samples of size 50 from the population, calculate the proportion of responses in each sample, filter for only the *Doesn't benefit* responses, and store each result in a vector called `sample_props50`. +Here, we use R to take 15,000 different samples of size 50 from the population, calculate the proportion of responses in each sample, count the *Doesn't benefit* responses, and store each result in a vector called `sample_props50`. Note that we specify that `replace = TRUE` since sampling distributions are constructed by sampling with replacement. ```{r iterate} sample_props50 <- global_monitor %>% rep_sample_n(size = 50, reps = 15000, replace = TRUE) %>% - count(scientist_work) %>% - mutate(p_hat = n /sum(n)) %>% - filter(scientist_work == "Doesn't benefit") + group_by(replicate)%>% + summarize(n = sum(scientist_work=="Doesn't benefit"), p_hat = mean(scientist_work=="Doesn't benefit")) ``` And we can visualize the distribution of these proportions with a histogram. @@ -179,9 +176,7 @@ We would have to manually run the following code 15,000 times ```{r sample-code} global_monitor %>% sample_n(size = 50, replace = TRUE) %>% - count(scientist_work) %>% - mutate(p_hat = n /sum(n)) %>% - filter(scientist_work == "Doesn't benefit") + summarize(n = sum(scientist_work=="Doesn't benefit"), p_hat = mean(scientist_work=="Doesn't benefit")) ``` as well as store the resulting sample proportions each time in a separate vector. @@ -326,4 +321,4 @@ You are welcome to use the app for exploration. ------------------------------------------------------------------------ -![Creative Commons License](https://i.creativecommons.org/l/by-sa/4.0/88x31.png){style="border-width:0"}
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. +![Creative Commons License](https://i.creativecommons.org/l/by-sa/4.0/88x31.png){style="border-width:0"}
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. \ No newline at end of file From 9d7fabbef00b2dd322ec48d3cebdfc30c9324dfe Mon Sep 17 00:00:00 2001 From: Michael McIsaac Date: Mon, 31 Oct 2022 16:03:57 -0300 Subject: [PATCH 2/6] Fixes #107 in the shinyApp. Replaces code based on filtering (which breaks down in the edge case where the sample proportion is 0, since there is then nothing to filter on) with code based on group_by + summarize. --- 05a_sampling_distributions/sampling_distributions.Rmd | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/05a_sampling_distributions/sampling_distributions.Rmd b/05a_sampling_distributions/sampling_distributions.Rmd index f6fa692..8ad13cb 100644 --- a/05a_sampling_distributions/sampling_distributions.Rmd +++ b/05a_sampling_distributions/sampling_distributions.Rmd @@ -251,9 +251,8 @@ shinyApp( sampling_dist <- reactive({ global_monitor %>% rep_sample_n(size = input$n_samp, reps = input$n_rep, replace = TRUE) %>% - count(scientist_work) %>% - mutate(p_hat = n /sum(n)) %>% - filter(scientist_work == input$outcome) + group_by(replicate)%>% + summarize(p_hat = mean(scientist_work==input$outcome)) }) # plot sampling distribution From 6d32fb79d6befd379bac1776fd3f1a09d1d98021 Mon Sep 17 00:00:00 2001 From: Michael McIsaac Date: Mon, 14 Nov 2022 14:06:45 -0400 Subject: [PATCH 3/6] Fixing error in lab 06 "Inference for categorical data" that caused the lab to crash: "Quitting from lines 70-76 (inf_for_categorical_data.Rmd) Warning: Error in : A proportion is not well-defined for a multinomial categorical response variable (text_ind) and no explanatory variable." text_ind contained "NA"s in addition to "yes"s and "no"s. This commit filters to complete cases (to eliminate the "NA"s). 
--- 06_inf_for_categorical_data/inf_for_categorical_data.Rmd | 1 + 1 file changed, 1 insertion(+) diff --git a/06_inf_for_categorical_data/inf_for_categorical_data.Rmd b/06_inf_for_categorical_data/inf_for_categorical_data.Rmd index 2c0e6c8..14d797b 100644 --- a/06_inf_for_categorical_data/inf_for_categorical_data.Rmd +++ b/06_inf_for_categorical_data/inf_for_categorical_data.Rmd @@ -68,6 +68,7 @@ The inferential tools for estimating population proportion are analogous to thos ```{r nohelmet-text-ci} no_helmet %>% + filter(complete.cases(text_ind)) %>% specify(response = text_ind, success = "yes") %>% generate(reps = 1000, type = "bootstrap") %>% calculate(stat = "prop") %>% From 0f13e288d53f2e8d8f8015a6c532075d5f146b44 Mon Sep 17 00:00:00 2001 From: Michael McIsaac Date: Tue, 15 Nov 2022 20:59:20 -0400 Subject: [PATCH 4/6] Editing questions in lab 6 to avoid confusing/misleading exercises. Fixes #111. Fixes #99. --- 06_inf_for_categorical_data/inf_for_categorical_data.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/06_inf_for_categorical_data/inf_for_categorical_data.Rmd b/06_inf_for_categorical_data/inf_for_categorical_data.Rmd index 14d797b..3fba452 100644 --- a/06_inf_for_categorical_data/inf_for_categorical_data.Rmd +++ b/06_inf_for_categorical_data/inf_for_categorical_data.Rmd @@ -41,7 +41,7 @@ The dataset is called `yrbss`. 1. What are the counts within each category for the amount of days these students have texted while driving within the past 30 days? -2. What is the proportion of people who have texted while driving every day in the past 30 days and never wear helmets? +2. What is the proportion of people who have texted while driving every day in the past 30 days among those who never wear helmets? Remember that you can use `filter` to limit the dataset to just non-helmet wearers. Here, we will name the dataset `no_helmet`. 
@@ -179,7 +179,7 @@ For some of the exercises below, you will conduct inference comparing two propor In such cases, you have a response variable that is categorical, and an explanatory variable that is also categorical, and you are comparing the proportions of success of the response variable across the levels of the explanatory variable. This means that when using `infer`, you need to include both variables within `specify`. -1. Is there convincing evidence that those who sleep 10+ hours per day are more likely to strength train every day of the week? +1. Is there convincing evidence that those who sleep 10+ hours per day are more likely to strength train every day of the week than those who don't sleep 10+ hours per day? As always, write out the hypotheses for any tests you conduct and outline the status of the conditions for inference. If you find a significant difference, also quantify this difference with a confidence interval. From 7946a9c8af8d1ea57af22b34ec4b049b6ab96a23 Mon Sep 17 00:00:00 2001 From: Michael McIsaac Date: Mon, 21 Nov 2022 13:49:35 -0400 Subject: [PATCH 5/6] Avoiding error caused by NAs by filtering on complete cases. --- 07_inf_for_numerical_data/inf_for_numerical_data.Rmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/07_inf_for_numerical_data/inf_for_numerical_data.Rmd b/07_inf_for_numerical_data/inf_for_numerical_data.Rmd index 58adcf7..fc59bf7 100644 --- a/07_inf_for_numerical_data/inf_for_numerical_data.Rmd +++ b/07_inf_for_numerical_data/inf_for_numerical_data.Rmd @@ -110,6 +110,7 @@ But first, we need to initialize the test, which we will save as `obs_diff`. ```{r inf-weight-habit-ht-initial, tidy=FALSE, warning = FALSE} obs_diff <- yrbss %>% + filter(complete.cases(physical_3plus)) %>% specify(weight ~ physical_3plus) %>% calculate(stat = "diff in means", order = c("yes", "no")) ``` @@ -124,6 +125,7 @@ We will save the permutation distribution as `null_dist`. 
```{r inf-weight-habit-ht-null, tidy=FALSE, warning = FALSE} null_dist <- yrbss %>% + filter(complete.cases(physical_3plus)) %>% specify(weight ~ physical_3plus) %>% hypothesize(null = "independence") %>% generate(reps = 1000, type = "permute") %>% From 18958f11d2126e09fceb949b25a6f2b55f6129a0 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 21 Nov 2022 17:52:10 +0000 Subject: [PATCH 6/6] Re-build Rmd --- 01_intro_to_r/intro_to_r.html | 32 ++++---- 02_intro_to_data/intro_to_data.html | 32 ++++---- 03_probability/probability.html | 32 ++++---- .../normal_distribution.html | 73 ++++++++++--------- .../inf_for_numerical_data.html | 46 +++++++----- 08_simple_regression/simple_regression.html | 32 ++++---- .../multiple_regression.html | 32 ++++---- 7 files changed, 156 insertions(+), 123 deletions(-) diff --git a/01_intro_to_r/intro_to_r.html b/01_intro_to_r/intro_to_r.html index 893691e..c23783f 100644 --- a/01_intro_to_r/intro_to_r.html +++ b/01_intro_to_r/intro_to_r.html @@ -1295,6 +1295,7 @@ + + + + + + +