diff --git a/.RData b/.RData new file mode 100644 index 0000000..13950b0 Binary files /dev/null and b/.RData differ diff --git a/.Rhistory b/.Rhistory new file mode 100644 index 0000000..881b825 --- /dev/null +++ b/.Rhistory @@ -0,0 +1,512 @@ +colnames(diamonds) +head(diamonds) +qplot(diamonds$carat, diamonds$price) +colnames(diamonds) +diamonds %>% +qplot(carat, price) +diamonds %>% +ggplot(carat, price) +diamonds %>% +ggplot("carat", "price") +diamonds %>% +qplot("carat", "price") +data <- diamonds +data <- diamonds +data %>% +qplot("carat", "price") +library(tidyverse) +head(diamonds) +qplot(diamonds$carat, diamonds$price) +qplot(diamonds$price) +qplot(data = diamonds, price) +qplot(data = diamonds, carat, price) +# Piping does not work for qplot, because the first argument of qplot is NOT data!! The first argument of qplot is "x". If you use ggplot, then you can pipe, but then it has different syntax and has aes. +qplot(data = diamonds, +x = carat, +y = price, +shape = clarity, +color = cut) +d <- ggplot(diamonds, aes(x=carat, y=price)) # first you set the aesthetic and dataset +d + geom_point() # then you add geoms +d + geom_point(aes(colour = carat)) # and you can keep doing this to add layers to the plot +d <- ggplot(diamonds, aes(x = carat, y = price)) # first you set the aesthetic and dataset +d + geom_point() # then you add geoms +d + geom_point(aes(colour = carat)) # and you can keep doing this to add layers to the plot +#qplot(data = diamonds, +# x = carat, +# y = price, +# shape = clarity, +# color = cut) +ggplot(data = diamonds +mapping = aes(x = carat, y = price) +#qplot(data = diamonds, +# x = carat, +# y = price, +# shape = clarity, +# color = cut) +ggplot(data = diamonds, +mapping = aes(x = carat, y = price) +) +#qplot(data = diamonds, +# x = carat, +# y = price, +# shape = clarity, +# color = cut) +ggplot(data = diamonds, +mapping = aes(x = "carat", y = "price") +) +#qplot(data = diamonds, +# x = carat, +# y = price, +# shape = clarity, +# color = cut) +ggplot(data = diamonds, +mapping = aes(x = carat, y = price) +) +#qplot(data = diamonds, +# x = carat, +# y = price, +# shape = clarity, +# color = cut) +ggplot(data = diamonds, +mapping = aes(x = carat, y = price) ++ geom_point() +) +d <- ggplot(diamonds, aes(x = carat, y = price)) # first you set the aesthetic and dataset +d + geom_point() # then you add geoms +# d + geom_point(aes(colour = carat)) # and you can keep doing this to add layers to the plot +#qplot(data = diamonds, +# x = carat, +# y = price, +# shape = clarity, +# color = cut) +d <- ggplot(diamonds, aes(x = carat, y = price)) +d + geom_point() +# ggplot(data = diamonds, +# mapping = aes(x = carat, y = price) +# + geom_point() +) +#qplot(data = diamonds, +# x = carat, +# y = price, +# shape = clarity, +# color = cut) +ggplot(diamonds, +aes(x = carat, y = price, +shape = clarity, +color = cut)) + +geom_point() +# d + geom_point() +# ggplot(data = diamonds, +# mapping = aes(x = carat, y = price) +# + geom_point() +) +ggplot(diamonds, +aes(x = carat, y = price, +shape = clarity, +color = cut)) + +geom_point() + +facet_grid(clarity ~ cut) +ggplot(diamonds, +aes(x = carat, y = price, +color = cut)) + +geom_point() + +facet_grid(clarity ~ cut) +ggplot(diamonds, aes(x=carat, y=price)) + +geom_point(shape = ".") + +facet_grid(cut ~ clarity) +ggplot(diamonds, aes(x=carat, y=price)) + +geom_point(shape = ".") + +facet_grid(cut ~ clarity) + +geom_smooth() +ggplot(diamonds, aes(x=carat, y=price)) + +geom_point(shape = ".") + +facet_grid(cut ~ clarity) + +# 
geom_smooth() +ggplot(diamonds, aes(x=carat, y=price)) + +geom_point(shape = ".") + +facet_grid(cut ~ clarity) # + +# geom_smooth() +ggplot(diamonds, aes(x=carat, y=price)) + +geom_point(shape = ".") + +facet_grid(cut ~ clarity) + +geom_smooth() +ggplot(diamonds, aes(x=carat, y=price)) + +geom_histogram(binwidth = "5") + +facet_grid(cut ~ clarity) +ggplot(diamonds, aes(x=carat)) + +geom_histogram(binwidth = "5") + +facet_grid(cut ~ clarity) +ggplot(diamonds, aes(x=carat)) + +geom_histogram(binwidth = 5) + +facet_grid(cut ~ clarity) +ggplot(diamonds, aes(x=carat)) + +geom_histogram(binwidth = 5) + +facet_grid(cut ~ clarity) + +geom_density() +ggplot(diamonds, aes(x=carat)) + +geom_histogram(binwidth = 5) + +# facet_grid(cut ~ clarity) + +geom_density() +ggplot(diamonds, aes(x=carat)) + +geom_histogram(binwidth = 5) + +# facet_grid(cut ~ clarity) + +# geom_density() +ggplot(diamonds, aes(x=carat)) + +geom_histogram(binwidth = 5) # + +# facet_grid(cut ~ clarity) + +# geom_density() +ggplot(diamonds, aes(x=carat)) + +geom_histogram(binwidth = 5) + +facet_grid(cut ~ clarity) + +geom_density() +ggplot(diamonds, aes(x=carat)) + +geom_histogram(binwidth = 5) + +facet_grid(cut ~ clarity) + +geom_density() +ggplot(diamonds, aes(x=carat)) + +geom_histogram(binwidth = 5) + +facet_grid(cut ~ clarity) + +geom_density() +ggplot(diamonds, aes(x=carat, y=price)) + +geom_point(shape = ".") + +facet_grid(cut ~ clarity) + +geom_smooth() +library(tidyverse) +# two helper functions +sem <- function(x) {sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))} +ci95 <- function(x) {sem(x) * 1.96} # lazy normal approximation +d <- read_csv("data/Ontogeny_fairness_seven_societies_data.csv", +na = c("NA", ".")) # they use . to indicate NA +facVars <- c("eq.uneq", "value", "decision") +d[, facVars] <- lapply(d[, facVars], factor) +d$trial.number <- as.numeric(gsub(".(\\d+)", "\\1", d$trial)) +facVars <- c("eq.uneq", "value", "decision") +d[, facVars] <- lapply(d[, facVars], factor) +d$trial.number <- as.numeric(gsub(".(\\d+)", "\\1", d$trial)) +d$trial_type <- factor(d$eq.uneq, +levels = c("E","U"), +labels = c("Equal","Unequal")) +d$condition <- factor(d$condition, +levels = c("AI","DI"), +labels = c("Advantageous","Disadvantageous")) +library(tidyverse) +# two helper functions +sem <- function(x) {sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))} +ci95 <- function(x) {sem(x) * 1.96} # lazy normal approximation +d <- read_csv("data/Ontogeny_fairness_seven_societies_data.csv", +na = c("NA", ".")) # they use . 
to indicate NA +facVars <- c("eq.uneq", "value", "decision") +d[, facVars] <- lapply(d[, facVars], factor) +d$trial.number <- as.numeric(gsub(".(\\d+)", "\\1", d$trial)) +d$trial_type <- factor(d$eq.uneq, +levels = c("E","U"), +labels = c("Equal","Unequal")) +d$condition <- factor(d$condition, +levels = c("AI","DI"), +labels = c("Advantageous","Disadvantageous")) +d$trial_type <- factor(d$eq.uneq, +levels = c("E","U"), +labels = c("Equal","Unequal")) +d$condition <- factor(d$condition, +levels = c("AI","DI"), +labels = c("Advantageous","Disadvantageous")) +colnames(d) +View(d) +colnames(d) +chart1 <- d %>% +group_by(country, actor.age.years) %>% +summarize() +chart1 <- d %>% +group_by(country, actor.age.years) %>% +summarize() +chart1 <- d %>% +group_by(country, actor.age.years) %>% +summarize(count()) +chart1 <- d %>% +group_by(country, actor.age.years) %>% +summarize(l = length()) +chart1 <- d %>% +group_by(country) %>% +summarize(number_n = n()) +chart1 <- d %>% +group_by(country) %>% +summarize(number_n = mean()) +colnames(d) +chart1 <- d %>% +group_by(country) %>% +summarize(count_n = count(decision)) +chart1 <- d %>% +group_by(country) %>% +summarize(decision_n = count(decision)) +chart1 <- d %>% +group_by(country) %>% +summarize(decision, Count = n ()) +chart1 <- d %>% +group_by(country) %>% +summarize(decision, Count = n ()) +chart1 <- d %>% +group_by(country) %>% +summarize(Count = n ()) +chart1 <- d %>% +group_by(country) %>% +summarize(count = n ()) +colnames(d) +library(tidyverse) +# two helper functions +sem <- function(x) {sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))} +ci95 <- function(x) {sem(x) * 1.96} # lazy normal approximation +d <- read_csv("data/Ontogeny_fairness_seven_societies_data.csv", +na = c("NA", ".")) # they use . to indicate NA +facVars <- c("eq.uneq", "value", "decision") +d[, facVars] <- lapply(d[, facVars], factor) +d$trial.number <- as.numeric(gsub(".(\\d+)", "\\1", d$trial)) +d$trial_type <- factor(d$eq.uneq, +levels = c("E","U"), +labels = c("Equal","Unequal")) +d$condition <- factor(d$condition, +levels = c("AI","DI"), +labels = c("Advantageous","Disadvantageous")) +colnames(d) +chart1 <- d %>% +group_by(country) %>% +summarize(count = n ()) +library(tidyverse) +# two helper functions +sem <- function(x) {sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))} +ci95 <- function(x) {sem(x) * 1.96} # lazy normal approximation +colnames(d) +chart1 <- d %>% +group_by(country) %>% +summarize(count = n ()) +chart1 <- d %>% +group_by(country) %>% +summarize(count = n ()) +d %>% +group_by(country) %>% +summarize(count = n ()) +country +d %>% +group_by(country, decision) %>% +summarize(count = n ()) +colnames(d) +d %>% +group_by(country, decision, eq.uneq) %>% +summarize(count = n ()) +d %>% +group_by(country, eq.uneq, decision) %>% +summarize(count = n ()) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +colnames(d) +d %>% +group_by(country, eq.uneq, decision) %>% +summarize(count = n ()) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +?facet +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +facet_wrap(country) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +facet_wrap(~country) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +facet_wrap(~eq.uneq) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +facet_wrap(country~eq.uneq) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") + +facet_wrap(country~eq.uneq) 
+ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") + +facet_wrap(~country) +colnames(d) +d <- d %>% +filter(eq.uneq != "NA" & decision != "NA") +colnames(d) +d <- d %>% +filter(eq.uneq != "NA" & decision != "NA") +d %>% +group_by(country, eq.uneq, decision) %>% +summarize(count = n ()) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") + +facet_wrap(~country) +colnames(d) +colnames(d) +d <- d %>% +filter(eq.uneq != "NA" & decision != "NA") +d %>% +group_by(country, eq.uneq, decision) %>% +summarize(count = n ()) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") + +facet_wrap(~country) +d <- d %>% +group_by(country) %>% +mutate() +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") + +facet_wrap(~country) +?mutate +d <- d %>% +mutate(mean(decision == accept)) +d <- d %>% +mutate(mean(decision == "accept")) +colnames(d) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") + +facet_wrap(~country) +d <- d %>% +mutate(percent_accept <- mean(decision == "accept")) +colnames(d) +d <- d %>% +mutate(mean(decision == "accept")) +colnames(d) +colnames(d) +d <- d %>% +mutate(foo = mean(decision == "accept")) +d <- d %>% +mutate(percentage_accept = mean(decision == "accept")) +colnames(d) +library(tidyverse) +# two helper functions +sem <- function(x) {sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))} +ci95 <- function(x) {sem(x) * 1.96} # lazy normal approximation +d <- read_csv("data/Ontogeny_fairness_seven_societies_data.csv", +na = c("NA", ".")) # they use . to indicate NA +facVars <- c("eq.uneq", "value", "decision") +d[, facVars] <- lapply(d[, facVars], factor) +d$trial.number <- as.numeric(gsub(".(\\d+)", "\\1", d$trial)) +d$trial_type <- factor(d$eq.uneq, +levels = c("E","U"), +labels = c("Equal","Unequal")) +d$condition <- factor(d$condition, +levels = c("AI","DI"), +labels = c("Advantageous","Disadvantageous")) +d <- d %>% +filter(eq.uneq != "NA" & decision != "NA") +d %>% +group_by(country, eq.uneq, decision) %>% +summarize(count = n ()) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") + +facet_wrap(~country) +d <- d %>% +mutate(percentage_accept = mean(decision == "accept")) +head(d) +head(d) +d <- d %>% +group_by(country) %>% +mutate(percentage_accept = mean(decision == "accept")) +head(d) +tail(d) +ggplot(d, aes(x = percentage_accept)) + +geom_histogram(stat = "count") + +facet_wrap(~country) +colnames(d) +ggplot(d, aes(x = country)) + +geom_histogram(stat = "count") + +facet_wrap(~percentage_accept) +ggplot(d, aes(x = country)) + +geom_histogram(stat = "count") + +facet_wrap(~actor.age.years) +ggplot(d, aes(x = actor.age.years)) + +geom_histogram(stat = "count") + +facet_wrap(~country) +ggplot(d, aes(x = actor.age.years)) + +geom_smooth + +facet_wrap(~country) +ggplot(d, aes(x = actor.age.years)) + +geom_smooth() + +facet_wrap(~country) +ggplot(d, aes(x = actor.age.years, y = percentage_accept)) + +geom_smooth() + +facet_wrap(~country) +d <- d %>% +group_by(country, actor.age.years) %>% +mutate(percentage_accept = mean(decision == "accept")) +ggplot(d, aes(x = actor.age.years, y = percentage_accept)) + +geom_smooth() + +facet_wrap(~country) +d <- d %>% +group_by(country, actor.age.years, eq.uneq) %>% +mutate(percentage_accept = mean(decision == "accept")) +ggplot(d, aes(x = actor.age.years, y = percentage_accept)) + 
+geom_smooth() + +facet_wrap(~country) +ggplot(d, aes(x = actor.age.years, y = percentage_accept)) + +geom_smooth() + +facet_wrap(eq.uneq ~country) +?geom_smooth +ggplot(d, aes(x = actor.age.years, y = country)) + +geom_line() + +facet_wrap(eq.uneq ~country) +ggplot(d, aes(x = actor.age.years)) + +geom_line() + +facet_wrap(eq.uneq ~country) +ggplot(d, aes(x = actor.age.years, y = percent_accept, color = country)) + +geom_smooth() + +facet_wrap(eq.uneq ~country) +library(tidyverse) +# two helper functions +sem <- function(x) {sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))} +ci95 <- function(x) {sem(x) * 1.96} # lazy normal approximation +d <- read_csv("data/Ontogeny_fairness_seven_societies_data.csv", +na = c("NA", ".")) # they use . to indicate NA +facVars <- c("eq.uneq", "value", "decision") +d[, facVars] <- lapply(d[, facVars], factor) +d$trial.number <- as.numeric(gsub(".(\\d+)", "\\1", d$trial)) +d$trial_type <- factor(d$eq.uneq, +levels = c("E","U"), +labels = c("Equal","Unequal")) +d$condition <- factor(d$condition, +levels = c("AI","DI"), +labels = c("Advantageous","Disadvantageous")) +d <- d %>% +filter(eq.uneq != "NA" & decision != "NA") +d %>% +group_by(country, eq.uneq, decision) %>% +summarize(count = n ()) +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") +ggplot(d, aes(x = decision)) + +geom_histogram(stat = "count") + +facet_wrap(~country) +d <- d %>% +group_by(country, actor.age.years, eq.uneq) %>% +mutate(percentage_accept = mean(decision == "accept")) +head(d) +tail(d) +ggplot(d, aes(x = country)) + +geom_histogram(stat = "count") + +facet_wrap(~actor.age.years) +colnames(d) +ggplot(d, aes(x = actor.age.years, y = percentage_accept)) + +geom_smooth() + +facet_wrap(eq.uneq ~country) +ggplot(d, aes(x = actor.age.years, y = percentage_accept, color = country)) + +geom_smooth() + +facet_wrap(eq.uneq ~country) +ggplot(d, aes(x = actor.age.years, y = percentage_accept, color = country)) + +geom_smooth() + +facet_wrap(~eq.uneq) +d <- d %>% +mutate(percentage_accept = mean(decision == "accept")) +ggplot(d, aes(x = actor.age.years, y = percentage_accept, color = country)) + +geom_smooth() + +facet_wrap(~eq.uneq) +ggplot(d, aes(x = actor.age.years, y = percentage_accept, color = country)) + +geom_smooth() + +facet_wrap(~eq.uneq) diff --git a/Ceballos_ggplot2_exercise.Rmd b/Ceballos_ggplot2_exercise.Rmd new file mode 100644 index 0000000..7ce6175 --- /dev/null +++ b/Ceballos_ggplot2_exercise.Rmd @@ -0,0 +1,182 @@ +--- +title: 'ggplot2 intro exercise' +author: "Cristina Ceballos" +date: "November 13, 2019" +output: html_document +--- + +This is an in-class exercise on using `ggplot2`. + +Note, that this example is from the_grammar.R on [http://had.co.nz/ggplot2](). I've adapted this for psych 251 purposes + +First install and load the package. It's part of the "core tidvyerse". + +```{r} +library(tidyverse) +``` + +# Exploring ggplot2 using `qplot` + +We'll start by using `qplot`. `qplot` is the easy interface, meant to replace `plot`. You can give it simple `qplot(x,y)` examples, or slightly more complex examples like `qplot(x, y, col=grp, data=d)`. + +We're going to be using the `diamonds` dataset. This is a set of measurements of diamonds, along with their price etc. + +```{r} +head(diamonds) +qplot(diamonds$carat, diamonds$price) +qplot(diamonds$price) +colnames(diamonds) +``` + +Scatter plots are trivial, and easy to add features to. 
Modify this plot so that it uses the dataframe rather than working from variables in the general namespace (good to get away from retyping `diamonds$` every time you reference a variable). + +```{r} + +qplot(data = diamonds, carat, price) + + +# Piping does not work for qplot, because the first argument of qplot is NOT data!! The first argument of qplot is "x". If you use ggplot, then you can pipe, but then it has different syntax and has aes. + +``` + +Try adding clarity and cut, using shape and color as your visual variables. + +```{r} + + +qplot(data = diamonds, + x = carat, + y = price, + shape = clarity, + color = cut) + +``` + + +# More complex with `ggplot` + +`ggplot` is just a way of building `qplot` calls up more systematically. It's +sometimes easier to use and sometimes a bit more complicated. What I want to show off here is the functionality of being able to build up complex plots with multiple elements. You can actually do this using `qplot` pretty easily, but there are a few things that are hard to do. + +`ggplot` is the basic call, where you specify A) a dataframe and B) an aesthetic (`aes`) mapping from variables in the plot space to variables in the dataset. + +```{r} +d <- ggplot(diamonds, aes(x = carat, y = price)) # first you set the aesthetic and dataset +d + geom_point() # then you add geoms +# d + geom_point(aes(colour = carat)) # and you can keep doing this to add layers to the plot +``` + +Try writing this as a single set of additions (e.g. one line of R code, though you can put in linebreaks). This is the most common workflow for me. + + +```{r} + +``` + + +# Facets + +Let's try out another version of the `qplot` example above. Rewrite the last qplot example with ggplot. + +```{r} + +#qplot(data = diamonds, + # x = carat, + # y = price, + # shape = clarity, + # color = cut) + +ggplot(diamonds, + aes(x = carat, y = price, + shape = clarity, + color = cut)) + + geom_point() + +# d + geom_point() + +# ggplot(data = diamonds, + # mapping = aes(x = carat, y = price) + # + geom_point() + # ) + +``` + +One of the primary benefits of `ggplot2` is the use of facets - also known as small multiples in the Tufte vocabulary. That last plot was probably hard to read. Facets could make it better. Try adding `facet_grid(x ~ y)`. `x ~ y` means row facets are by `x`, column facets by `y`. + +```{r} + +ggplot(diamonds, + aes(x = carat, y = price, + color = cut)) + + geom_point() + + facet_grid(clarity ~ cut) + +# Facet statement has two variables: clarity and cut. Cut is fair, good, premium, ideal. it tells you what to make the columsn. The rows are the cut. + +# If you wanted just all the cuts. clarity is "x", cut is "y". Gives you a bunch of mini-plots. + +# Otherwise, you can switch to facet_wrap. And it will wrap it around for the most pleasing grid of subplots. +# Facet_wrap(~clarity) + +# When you have discrete variables, and you want to look at the effect in some continuous variable space. That's when facet is really useful. + +# To export these kinds of plots, you can use ggssave("~/Desktop/Diamons.pdf", plot = p) + +# You can also export it. Plot it to the plotting window, and use export. But this is less reproducible. + + +``` + +But facets can also get overwhelming. Try to strike a good balance between color, shape, and faceting. + +HINT: `facet_grid(. ~ x)` puts x on the columns, but `facet_wrap(~ x)` (no dot) *wraps* the facets. 
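+One possible balance (just a sketch, not the only good answer): keep color for `cut`, facet only on `clarity`, and make the points transparent so the dense regions stay readable.
+
+```{r}
+ggplot(diamonds, aes(x = carat, y = price, color = cut)) +
+  geom_point(alpha = .1) +
+  facet_wrap(~ clarity)
+```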
+ +```{r} + +``` + + +# Geoms + +As you've seen above, the basic unit of a ggplot plot is a "geom" - a mapping between data (via an "aesthetic") and a particular geometric configuration on coordinate axes. + +Let's try adding some geoms and manipulating their parameters. One combo I really like is a scatterplot with a smoothing layer (`geom_smooth`). Try adding one onto this plot. + +```{r} +ggplot(diamonds, aes(x=carat, y=price)) + + geom_point(shape = ".") + + facet_grid(cut ~ clarity) + + geom_smooth() +``` + + +CHALLENGE: You could also try starting with a histogram and adding densities. (`geom_density`), Try [this link](https://stackoverflow.com/questions/5688082/ggplot2-overlay-histogram-with-density-curve). + +```{r} + +ggplot(diamonds, aes(x=carat)) + + geom_histogram(binwidth = 5) + + facet_grid(cut ~ clarity) + + geom_density() + + +``` + +# Themes and plot cleanup + +I like a slightly cleaner look to my plots. Luckily, ggplot allows you to add "themes" to your plots. Try doing the same plot but adding `+ theme_bw()` or `+ theme_classic()`. Different themes work better for different applications, in my experience. My favorite right now is `ggthemes::theme_few`. + +You can also try different color scales. Experiment with `scale_color_...` - try writing that and hitting TAB for autocomplete options. Check out the help on this. + +You can also try transparency/different point sizes to clean up scatterplots. Try `alpha = .1` or `pch = "."` to make transparent or small points. + +Finally, don't forget to "fix the axis labels"! + +Here's a somewhat ugly plot - see if you can make it look awesome. + +```{r} +ggplot(diamonds, aes(x = carat, y = price, col = cut)) + + geom_point() + + facet_wrap(~clarity) +``` + diff --git a/Ceballos_ps4.Rmd b/Ceballos_ps4.Rmd new file mode 100644 index 0000000..1a01ce3 --- /dev/null +++ b/Ceballos_ps4.Rmd @@ -0,0 +1,192 @@ +--- +title: 'Psych 251 PS4: Simulation' +author: "Cristina Ceballos" +date: "11/11/2019" +output: + html_document: + toc: true +--- + +This is problem set #4, in which we want you to integrate your knowledge of data wrangling with some basic simulation skills. It's a short problem set to help you get your feet wet in testing statistical concepts through "making up data" rather than consulting a textbook or doing math. + +For ease of reading, please separate your answers from our text by marking our text with the `>` character (indicating quotes). + +```{r, warning=F, message=F} +library(tidyverse) +``` + +Let's start by convincing ourselves that t-tests have the appropriate false positive rate. Run 10,000 t-tests with standard, normally-distributed data from a made up 30-person, single-measurement experiment (the command for sampling from a normal distribution is `rnorm`). + +The goal of these t-tests are to determine, based on 30 observations, whether the underlying distribution (in this case a normal distribution with mean 0 and standard deviation 1) has a mean that is different from 0. In reality, the mean is not different from 0 (we sampled it using `rnorm`), but sometimes the 30 observations we get in our experiment will suggest that the mean is higher or lower. In this case, we'll get a "significant" result and incorrectly reject the null hypothesis of mean 0. + +What's the proportion of "significant" results ($p < .05$) that you see? + +First do this using a `for` loop. 
+ +```{r} + +result <- numeric(10000) + +for (i in 1:10000) { + X <- rnorm(30, 0, 1) + result[i] <- t.test(X)$p.value +} + +sig_results <- result < 0.05 + +sum(sig_results)/10000 + + +``` + +Next, do this using the `replicate` function: + +```{r} + +pvals <- replicate(10000, t.test(rnorm(30))$p.value) + +sig_pvals <- pvals < 0.05 + +sum(sig_pvals)/10000 + +``` + +How does this compare to the intended false-positive rate of $\alpha=0.05$? + +> This answer makes sense. I'm getting a false positive rate of about 5 percent or slightly less than 5 percent. That's what I would expect given that I am testing for signifiance of ($p < .05$). + +Ok, that was a bit boring. Let's try something more interesting - let's implement a p-value sniffing simulation, in the style of Simons, Nelson, & Simonsohn (2011). + +Consider this scenario: you have done an experiment, again with 30 participants (one observation each, just for simplicity). The question is whether the true mean is different from 0. You aren't going to check the p-value every trial, but let's say you run 30 - then if the p-value is within the range p < .25 and p > .05, you optionally run 30 more and add those data, then test again. But if the original p value is < .05, you call it a day, and if the original is > .25, you also stop. + +First, write a function that implements this sampling regime. + +```{r} +double.sample <- function () { + + first_sample <- rnorm(30) + + pvals_2 <- (t.test(first_sample)$p.value) + + if(pvals_2<0.05) { + return(pvals_2) + } else if (pvals_2>0.25) { + return(pvals_2) + } else { + second_sample <- c(first_sample, rnorm(30)) + return(t.test(second_sample)$p.value) + } + +} + + +``` + +Now call this function 10k times and find out what happens. + +```{r} + +tenk_results <- replicate(10000, double.sample()) + +sum(tenk_results < 0.05)/10000 + + +``` + +Is there an inflation of false positives? How bad is it? + +> Yes, there's an inflation of false positives. I'm getting about 7 percent false positives, using the double sample technique. This is what Mike talked about in class, with p-hacking. While I may falsely believe that I am keeping p-value at less than 0.05, the double sampling technique is instead giving me slightly higher false positives. + +Now modify this code so that you can investigate this "double the sample" rule in a bit more depth. In the previous question, the researcher doubles the sample only when they think they got "close" to a significant result, i.e. when their not-significant p is less than 0.25. What if the researcher was more optimistic? See what happens in these 3 other scenarios: + +* The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.5. +* The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.75. +* The research doubles their sample whenever they get ANY pvalue that is not significant. + +How do these choices affect the false positive rate? + +HINT: Try to do this by making the function `double.sample` take the upper p value as an argument, so that you can pass this through dplyr. + +HINT 2: You may need more samples. Find out by looking at how the results change from run to run. + +```{r} + +# The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.5. 
+ +double.sample0.5 <- function () { + + first_sample_0.05 <- rnorm(30) + + pvals_2_0.05 <- (t.test(first_sample_0.05)$p.value) + + if(pvals_2_0.05<0.05) { + return(pvals_2_0.05) + } else if (pvals_2_0.05>0.5) { + return(pvals_2_0.05) + } else { + second_sample_0.05 <- c(first_sample_0.05, rnorm(30)) + return(t.test(second_sample_0.05)$p.value) + } + +} + +tenk_results_0.05 <- replicate(10000, double.sample0.5()) + +sum(tenk_results_0.05 < 0.05)/10000 + + + +# The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.75. + +double.sample0.75 <- function () { + + first_sample_0.75 <- rnorm(30) + + pvals_2_0.75 <- (t.test(first_sample_0.75)$p.value) + + + if(pvals_2_0.75<0.05) { + return(pvals_2_0.75) + } else if (pvals_2_0.75>0.75) { + return(pvals_2_0.75) + } else { + second_sample_0.75 <- c(first_sample_0.75, rnorm(30)) + return(t.test(second_sample_0.75)$p.value) + } + +} + +tenk_results_0.75 <- replicate(10000, double.sample0.75()) + +sum(tenk_results_0.75 < 0.05)/10000 + + + +# The researcher doubles the sample whenever their pvalue is not significant. + +double.sample_whenever <- function () { + + first_sample_whenever <- rnorm(30) + pvals_2_whenever <- (t.test(first_sample_whenever)$p.value) + + if(pvals_2_whenever<0.05) { + return(pvals_2_whenever) + } else { + second_sample_whenever <- c(first_sample_whenever, rnorm(30)) + return(t.test(second_sample_whenever)$p.value) + } + +} + +tenk_results_whenever <- replicate(10000, double.sample_whenever()) + +sum(tenk_results_whenever < 0.05)/10000 + + + + +``` + +What do you conclude on the basis of this simulation? How bad is this kind of data-dependent policy? + +> I see an inflation of false positives. The more liberal I am with my double-sampling policy, the higher the rate of false positives. For example, when I sampled again whenever, regardless of my p value, I got an 8 percent rate of false positives. This data-dependent double-sampling policy is leading to many more false positives, even though my putative p-value is set below 0.5 diff --git a/Ceballos_ps4.html b/Ceballos_ps4.html new file mode 100644 index 0000000..5c1d284 --- /dev/null +++ b/Ceballos_ps4.html @@ -0,0 +1,557 @@ + + + + + + + + + + + + + + + + +Psych 251 PS4: Simulation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

This is problem set #4, in which we want you to integrate your knowledge of data wrangling with some basic simulation skills. It’s a short problem set to help you get your feet wet in testing statistical concepts through “making up data” rather than consulting a textbook or doing math.

+

For ease of reading, please separate your answers from our text by marking our text with the > character (indicating quotes).

+
library(tidyverse)
+

Let’s start by convincing ourselves that t-tests have the appropriate false positive rate. Run 10,000 t-tests with standard, normally-distributed data from a made up 30-person, single-measurement experiment (the command for sampling from a normal distribution is rnorm).

+

The goal of these t-tests is to determine, based on 30 observations, whether the underlying distribution (in this case a normal distribution with mean 0 and standard deviation 1) has a mean that is different from 0. In reality, the mean is not different from 0 (we sampled it using rnorm), but sometimes the 30 observations we get in our experiment will suggest that the mean is higher or lower. In this case, we’ll get a “significant” result and incorrectly reject the null hypothesis of mean 0.

+

What’s the proportion of “significant” results (\(p < .05\)) that you see?

+

First do this using a for loop.

+
result <- numeric(10000)
+
+for (i in 1:10000) {
+  X <- rnorm(30, 0, 1)
+  result[i] <- t.test(X)$p.value
+}
+
+sig_results <- result < 0.05
+
+sum(sig_results)/10000
+
## [1] 0.0527
+

Next, do this using the replicate function:

+
pvals <- replicate(10000, t.test(rnorm(30))$p.value)
+
+sig_pvals <- pvals < 0.05
+
+sum(sig_pvals)/10000
+
## [1] 0.0504
+

How does this compare to the intended false-positive rate of \(\alpha=0.05\)?

+
+

This answer makes sense. I’m getting a false positive rate of about 5 percent, which is what I would expect given that I am testing for significance at \(p < .05\).

+
+

Ok, that was a bit boring. Let’s try something more interesting - let’s implement a p-value sniffing simulation, in the style of Simons, Nelson, & Simonsohn (2011).

+

Consider this scenario: you have done an experiment, again with 30 participants (one observation each, just for simplicity). The question is whether the true mean is different from 0. You aren’t going to check the p-value every trial, but let’s say you run 30 - then if the p-value is within the range p < .25 and p > .05, you optionally run 30 more and add those data, then test again. But if the original p value is < .05, you call it a day, and if the original is > .25, you also stop.

+

First, write a function that implements this sampling regime.

+
double.sample <- function () {
+  
+  first_sample <- rnorm(30)
+
+    pvals_2 <- (t.test(first_sample)$p.value)
+
+  if(pvals_2<0.05) {
+      return(pvals_2)
+  } else if (pvals_2>0.25) {
+      return(pvals_2) 
+  } else {
+      second_sample <- c(first_sample, rnorm(30))
+      return(t.test(second_sample)$p.value)
+  }
+
+}
+

Now call this function 10k times and find out what happens.

+
tenk_results <- replicate(10000, double.sample())
+
+sum(tenk_results < 0.05)/10000
+
## [1] 0.0763
+

Is there an inflation of false positives? How bad is it?

+
+

Yes, there’s an inflation of false positives: I’m getting about 7 percent false positives using the double-sampling technique. This is the p-hacking Mike talked about in class. While I may believe I am keeping the false-positive rate at 0.05, the double-sampling rule is actually inflating it.

+
+

Now modify this code so that you can investigate this “double the sample” rule in a bit more depth. In the previous question, the researcher doubles the sample only when they think they got “close” to a significant result, i.e. when their not-significant p is less than 0.25. What if the researcher was more optimistic? See what happens in these 3 other scenarios:

+
* The researcher doubles the sample whenever their p-value is not significant, but it’s less than 0.5.
* The researcher doubles the sample whenever their p-value is not significant, but it’s less than 0.75.
* The researcher doubles their sample whenever they get ANY p-value that is not significant.

How do these choices affect the false positive rate?

+

HINT: Try to do this by making the function double.sample take the upper p value as an argument, so that you can pass this through dplyr.

+

HINT 2: You may need more samples. Find out by looking at how the results change from run to run.

+
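Per the first hint, the stopping rule can be written once with the upper threshold as an argument; here is a minimal sketch (the function name and default value are mine, not part of the assignment), followed by the three scenarios written out separately below:

+
double.sample.upper <- function (upper = 0.25) {
+  first_sample <- rnorm(30)
+  p <- t.test(first_sample)$p.value
+  if (p < 0.05 | p > upper) {
+    return(p)  # stop: already significant, or too far from significance to chase
+  } else {
+    # "close": double the sample and test again
+    return(t.test(c(first_sample, rnorm(30)))$p.value)
+  }
+}
+
+# e.g., the original rule's false-positive rate:
+mean(replicate(10000, double.sample.upper(0.25)) < 0.05)
+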
# The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.5.
+
+double.sample0.5 <- function () {
+  
+  first_sample_0.05 <- rnorm(30)
+
+    pvals_2_0.05 <- (t.test(first_sample_0.05)$p.value)
+
+    if(pvals_2_0.05<0.05) {
+      return(pvals_2_0.05)
+  } else if (pvals_2_0.05>0.5) {
+      return(pvals_2_0.05) 
+  } else {
+      second_sample_0.05 <- c(first_sample_0.05, rnorm(30))
+      return(t.test(second_sample_0.05)$p.value)
+  }
+
+}
+
+tenk_results_0.05 <- replicate(10000, double.sample0.5())
+
+sum(tenk_results_0.05 < 0.05)/10000
+
## [1] 0.0803
+
# The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.75.
+
+double.sample0.75 <- function () {
+  
+  first_sample_0.75 <- rnorm(30)
+
+    pvals_2_0.75 <- (t.test(first_sample_0.75)$p.value)
+
+  
+  if(pvals_2_0.75<0.05) {
+      return(pvals_2_0.75)
+  } else if (pvals_2_0.75>0.75) {
+      return(pvals_2_0.75) 
+  } else {
+      second_sample_0.75 <- c(first_sample_0.75, rnorm(30))
+      return(t.test(second_sample_0.75)$p.value)
+  }
+
+}
+
+tenk_results_0.75 <- replicate(10000, double.sample0.75())
+
+sum(tenk_results_0.75 < 0.05)/10000
+
## [1] 0.0821
+
# The researcher doubles the sample whenever their pvalue is not significant.
+
+double.sample_whenever <- function () {
+  
+  first_sample_whenever <- rnorm(30)
+    pvals_2_whenever <- (t.test(first_sample_whenever)$p.value)
+
+  if(pvals_2_whenever<0.05) {
+      return(pvals_2_whenever)
+  } else {
+      second_sample_whenever <- c(first_sample_whenever, rnorm(30))
+      return(t.test(second_sample_whenever)$p.value)
+  }
+
+}
+
+tenk_results_whenever <- replicate(10000, double.sample_whenever())
+
+sum(tenk_results_whenever < 0.05)/10000
+
## [1] 0.086
+

What do you conclude on the basis of this simulation? How bad is this kind of data-dependent policy?

+
+

I see an inflation of false positives: the more liberal my double-sampling policy, the higher the false-positive rate. For example, when I doubled the sample whenever the result was not significant, regardless of the p-value, I got a false-positive rate of about 8.6 percent. This data-dependent double-sampling policy leads to many more false positives, even though my nominal significance threshold is still 0.05.

+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/blake exercise.Rmd b/blake exercise.Rmd new file mode 100644 index 0000000..e3a95d3 --- /dev/null +++ b/blake exercise.Rmd @@ -0,0 +1,122 @@ +--- +title: 'Blake et al. (2015) exercise' +author: "Mike Frank" +date: "November 15, 2019" +output: + html_document: + toc: true +--- + +# Intro + +This is an in-class exercise exploring Blake et al. (2015), [Ontogeny of fairness in seven societies](http://www.nature.com/nature/journal/v528/n7581/full/nature15703.html), *Nature*. + +Please explore these data together (without looking at the analyses supplied by the authors). + +The overall goal is to understand the degree to which the data support the authors' hypotheses, and to make some more awesome plots along the way. + +```{r} +library(tidyverse) + +# two helper functions +sem <- function(x) {sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))} +ci95 <- function(x) {sem(x) * 1.96} # lazy normal approximation +``` + +# Data Prep + +First read in the data, as distributed by the journal. + +```{r} +d <- read_csv("data/Ontogeny_fairness_seven_societies_data.csv", + na = c("NA", ".")) # they use . to indicate NA +``` + +Do some preprocessing, taken directly from the supplemental material. + +```{r} +facVars <- c("eq.uneq", "value", "decision") +d[, facVars] <- lapply(d[, facVars], factor) +d$trial.number <- as.numeric(gsub(".(\\d+)", "\\1", d$trial)) +``` + +Rename things so that they are easy to deal with. I hate hard to remember abbreviations for condition names. + +```{r} +d$trial_type <- factor(d$eq.uneq, + levels = c("E","U"), + labels = c("Equal","Unequal")) +d$condition <- factor(d$condition, + levels = c("AI","DI"), + labels = c("Advantageous","Disadvantageous")) +``` + +# Variable exploration + +Describe the dataset graphically in ways that are useful for you to get a handle on the data collection effort. + +Histograms are good. Ages of the participants are useful too. + +Remember your `group_by` + `summarise` workflow. This will help you here. + +```{r} + +d <- d %>% + filter(eq.uneq != "NA" & decision != "NA") + +d %>% + group_by(country, eq.uneq, decision) %>% + summarize(count = n ()) + + +ggplot(d, aes(x = decision)) + + geom_histogram(stat = "count") + + +ggplot(d, aes(x = decision)) + + geom_histogram(stat = "count") + + facet_wrap(~country) + + +d <- d %>% + mutate(percentage_accept = mean(decision == "accept")) + +head(d) +tail(d) + +ggplot(d, aes(x = country)) + + geom_histogram(stat = "count") + + facet_wrap(~actor.age.years) + + +``` + +Make sure you understand what the design was: how many trials per participant, what was between- and within-subject, etc. + +# Hypothesis-related exploration + +In this second, explore the authors' hypotheses related to advantageous and inadvantageous inequity aversion. Create 1 - 3 pictures that describe the support (or lack of it) for this hypothesis. + +```{r} + +colnames(d) + +ggplot(d, aes(x = actor.age.years, y = percentage_accept)) + + geom_smooth() + + facet_wrap(eq.uneq ~country) + + + +``` + + +```{r} + +ggplot(d, aes(x = actor.age.years, y = percentage_accept, color = country)) + + geom_smooth() + + facet_wrap(~eq.uneq) + + + +``` + diff --git a/blake-exercise.html b/blake-exercise.html new file mode 100644 index 0000000..2711a64 --- /dev/null +++ b/blake-exercise.html @@ -0,0 +1,487 @@ + + + + + + + + + + + + + + + + +Blake et al. (2015) exercise + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+ +
+ +
+

Intro

+

This is an in-class exercise exploring Blake et al. (2015), Ontogeny of fairness in seven societies, Nature.

+

Please explore these data together (without looking at the analyses supplied by the authors).

+

The overall goal is to understand the degree to which the data support the authors’ hypotheses, and to make some more awesome plots along the way.

+
library(tidyverse)
+
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
+
## v ggplot2 3.2.1     v purrr   0.3.3
+## v tibble  2.1.3     v dplyr   0.8.3
+## v tidyr   1.0.0     v stringr 1.4.0
+## v readr   1.3.1     v forcats 0.4.0
+
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
+## x dplyr::filter() masks stats::filter()
+## x dplyr::lag()    masks stats::lag()
+
# two helper functions
+sem <- function(x) {sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))}
+ci95 <- function(x) {sem(x) * 1.96} # lazy normal approximation
+
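For instance (a quick sketch on made-up data), the helpers give a mean with a 95% interval:

+
x <- rnorm(100)
+c(mean = mean(x), lower = mean(x) - ci95(x), upper = mean(x) + ci95(x))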
+
+

Data Prep

+

First read in the data, as distributed by the journal.

+
d <- read_csv("data/Ontogeny_fairness_seven_societies_data.csv", 
+              na = c("NA", ".")) # they use . to indicate NA
+
## Parsed with column specification:
+## cols(
+##   condition = col_character(),
+##   country = col_character(),
+##   actor.id = col_character(),
+##   actor.age.years = col_double(),
+##   actor.gender = col_character(),
+##   dist = col_character(),
+##   eq.uneq = col_character(),
+##   decision = col_character(),
+##   value = col_character(),
+##   trial = col_character()
+## )
+

Do some preprocessing, taken directly from the supplemental material.

+
facVars <- c("eq.uneq", "value", "decision")
+d[, facVars] <- lapply(d[, facVars], factor)
+d$trial.number <- as.numeric(gsub(".(\\d+)", "\\1", d$trial))
+

Rename things so that they are easy to deal with. I hate hard-to-remember abbreviations for condition names.

+
d$trial_type <- factor(d$eq.uneq, 
+                       levels = c("E","U"), 
+                       labels = c("Equal","Unequal"))
+d$condition <- factor(d$condition,
+                      levels = c("AI","DI"), 
+                      labels = c("Advantageous","Disadvantageous"))
+
+
+

Variable exploration

+

Describe the dataset graphically in ways that are useful for you to get a handle on the data collection effort.

+

Histograms are good. Ages of the participants are useful too.

+

Remember your group_by + summarise workflow. This will help you here.

+
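A minimal starting point might look like this (column names come from the read_csv spec above; treating "accept" as a level of decision is an assumption based on the task):

+
d %>%
+  group_by(country) %>%
+  summarise(n = n(),
+            mean_age = mean(actor.age.years, na.rm = TRUE),
+            prop_accept = mean(decision == "accept", na.rm = TRUE))
+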

Make sure you understand what the design was: how many trials per participant, what was between- and within-subject, etc.

+
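The .Rmd’s hypothesis-related exploration is not rendered here; a sketch of that step (acceptance rate computed per country, age, and trial type with a grouped mutate, then smoothed over age) would be:

+
d <- d %>%
+  filter(!is.na(eq.uneq) & !is.na(decision)) %>%
+  group_by(country, actor.age.years, trial_type) %>%
+  mutate(percentage_accept = mean(decision == "accept"))
+
+ggplot(d, aes(x = actor.age.years, y = percentage_accept, color = country)) +
+  geom_smooth() +
+  facet_wrap(~ trial_type)
+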
+ + + + + +
+ + + + + + + + + + + + + + + diff --git a/hw4 updated.Rmd b/hw4 updated.Rmd new file mode 100644 index 0000000..5ac51f0 --- /dev/null +++ b/hw4 updated.Rmd @@ -0,0 +1,251 @@ +--- +title: 'Psych 251 PS4: Simulation + Analysis' +author: "Cristina Ceballos" +date: "11/15/2019" +output: + pdf_document: + toc: yes + html_document: + toc: yes +--- + +This is problem set #4, in which we want you to integrate your knowledge of data wrangling with some basic simulation skills. It's a short problem set to help consolidate your `ggplot2` skills and then help you get your feet wet in testing statistical concepts through "making up data" rather than consulting a textbook or doing math. + +For ease of reading, please separate your answers from our text by marking our text with the `>` character (indicating quotes). + +# Part 1: ggplot practice + +This part is a warmup, it should be relatively straightforward `ggplot2` practice. + +Load data from Frank, Vul, Saxe (2011, Infancy), a study in which we measured infants' looking to hands in moving scenes. There were infants from 3 months all the way to about two years, and there were two movie conditions (`Faces_Medium`, in which kids played on a white background, and `Faces_Plus`, in which the backgrounds were more complex and the people in the videos were both kids and adults). An eye-tracker measured children's attention to faces. This version of the dataset only gives two conditions and only shows the amount of looking at hands (other variables were measured as well). + +```{r, warning=F, message=F} +library(tidyverse) +``` + + + +```{r} +fvs <- read_csv("data/FVS2011-hands.csv") + +colnames(fvs) +head(fvs) +``` + +First, use `ggplot` to plot a histogram of the ages of children in the study. NOTE: this is a repeated measures design, so you can't just take a histogram of every measurement. + +```{r} + + +fvs %>% + group_by(subid, age) %>% + summarise() + + +ggplot(fvs, aes(x=fvs$age)) + geom_histogram() + + +``` + +Second, make a scatter plot showing hand looking as a function of age and condition. Add appropriate smoothing lines. Take the time to fix the axis labels and make the plot look nice. + +```{r} + +ggplot(fvs, aes(x=fvs$age, y=fvs$hand.look)) + + geom_point(shape = ".") + + geom_smooth() + + +``` + +What do you conclude from this pattern of data? + +> ANSWER HERE?? Around 15 months, infants start paying more attention to hands. + +What statistical analyses would you perform here to quantify these differences? + +> ANSWER HERE. ???? + + +# Part 2: Simulation + +```{r, warning=F, message=F} +library(tidyverse) +``` + + +Let's start by convincing ourselves that t-tests have the appropriate false positive rate. Run 10,000 t-tests with standard, normally-distributed data from a made up 30-person, single-measurement experiment (the command for sampling from a normal distribution is `rnorm`). + +The goal of these t-tests are to determine, based on 30 observations, whether the underlying distribution (in this case a normal distribution with mean 0 and standard deviation 1) has a mean that is different from 0. In reality, the mean is not different from 0 (we sampled it using `rnorm`), but sometimes the 30 observations we get in our experiment will suggest that the mean is higher or lower. In this case, we'll get a "significant" result and incorrectly reject the null hypothesis of mean 0. + +What's the proportion of "significant" results ($p < .05$) that you see? + +First do this using a `for` loop. 
+ +```{r} + +result <- numeric(10000) + +for (i in 1:10000) { + X <- rnorm(30, 0, 1) + result[i] <- t.test(X)$p.value +} + +sig_results <- result < 0.05 + +sum(sig_results)/10000 + + +``` + +Next, do this using the `replicate` function: + +```{r} + +pvals <- replicate(10000, t.test(rnorm(30))$p.value) + +sig_pvals <- pvals < 0.05 + +sum(sig_pvals)/10000 + +``` + +How does this compare to the intended false-positive rate of $\alpha=0.05$? + +> This answer makes sense. I'm getting a false positive rate of about 5 percent or slightly less than 5 percent. That's what I would expect given that I am testing for signifiance of ($p < .05$). + +Ok, that was a bit boring. Let's try something more interesting - let's implement a p-value sniffing simulation, in the style of Simons, Nelson, & Simonsohn (2011). + +Consider this scenario: you have done an experiment, again with 30 participants (one observation each, just for simplicity). The question is whether the true mean is different from 0. You aren't going to check the p-value every trial, but let's say you run 30 - then if the p-value is within the range p < .25 and p > .05, you optionally run 30 more and add those data, then test again. But if the original p value is < .05, you call it a day, and if the original is > .25, you also stop. + +First, write a function that implements this sampling regime. + +```{r} +double.sample <- function () { + + first_sample <- rnorm(30) + + pvals_2 <- (t.test(first_sample)$p.value) + + if(pvals_2<0.05) { + return(pvals_2) + } else if (pvals_2>0.25) { + return(pvals_2) + } else { + second_sample <- c(first_sample, rnorm(30)) + return(t.test(second_sample)$p.value) + } + +} + + +``` + +Now call this function 10k times and find out what happens. + +```{r} + +tenk_results <- replicate(10000, double.sample()) + +sum(tenk_results < 0.05)/10000 + + +``` + +Is there an inflation of false positives? How bad is it? + +> Yes, there's an inflation of false positives. I'm getting about 7 percent false positives, using the double sample technique. This is what Mike talked about in class, with p-hacking. While I may falsely believe that I am keeping p-value at less than 0.05, the double sampling technique is instead giving me slightly higher false positives. + +Now modify this code so that you can investigate this "double the sample" rule in a bit more depth. In the previous question, the researcher doubles the sample only when they think they got "close" to a significant result, i.e. when their not-significant p is less than 0.25. What if the researcher was more optimistic? See what happens in these 3 other scenarios: + +* The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.5. +* The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.75. +* The research doubles their sample whenever they get ANY pvalue that is not significant. + +How do these choices affect the false positive rate? + +HINT: Try to do this by making the function `double.sample` take the upper p value as an argument, so that you can pass this through dplyr. + +HINT 2: You may need more samples. Find out by looking at how the results change from run to run. + +```{r} + +# The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.5. 
+ +double.sample0.5 <- function () { + + first_sample_0.05 <- rnorm(30) + + pvals_2_0.05 <- (t.test(first_sample_0.05)$p.value) + + if(pvals_2_0.05<0.05) { + return(pvals_2_0.05) + } else if (pvals_2_0.05>0.5) { + return(pvals_2_0.05) + } else { + second_sample_0.05 <- c(first_sample_0.05, rnorm(30)) + return(t.test(second_sample_0.05)$p.value) + } + +} + +tenk_results_0.05 <- replicate(10000, double.sample0.5()) + +sum(tenk_results_0.05 < 0.05)/10000 + + + +# The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.75. + +double.sample0.75 <- function () { + + first_sample_0.75 <- rnorm(30) + + pvals_2_0.75 <- (t.test(first_sample_0.75)$p.value) + + + if(pvals_2_0.75<0.05) { + return(pvals_2_0.75) + } else if (pvals_2_0.75>0.75) { + return(pvals_2_0.75) + } else { + second_sample_0.75 <- c(first_sample_0.75, rnorm(30)) + return(t.test(second_sample_0.75)$p.value) + } + +} + +tenk_results_0.75 <- replicate(10000, double.sample0.75()) + +sum(tenk_results_0.75 < 0.05)/10000 + + + +# The researcher doubles the sample whenever their pvalue is not significant. + +double.sample_whenever <- function () { + + first_sample_whenever <- rnorm(30) + pvals_2_whenever <- (t.test(first_sample_whenever)$p.value) + + if(pvals_2_whenever<0.05) { + return(pvals_2_whenever) + } else { + second_sample_whenever <- c(first_sample_whenever, rnorm(30)) + return(t.test(second_sample_whenever)$p.value) + } + +} + +tenk_results_whenever <- replicate(10000, double.sample_whenever()) + +sum(tenk_results_whenever < 0.05)/10000 + + + + +``` + +What do you conclude on the basis of this simulation? How bad is this kind of data-dependent policy? + +> I see an inflation of false positives. The more liberal I am with my double-sampling policy, the higher the rate of false positives. For example, when I sampled again whenever, regardless of my p value, I got an 8 percent rate of false positives. This data-dependent double-sampling policy is leading to many more false positives, even though my putative p-value is set below 0.5 diff --git a/hw4-updated.html b/hw4-updated.html new file mode 100644 index 0000000..96feec5 --- /dev/null +++ b/hw4-updated.html @@ -0,0 +1,628 @@ + + + + + + + + + + + + + + + + +Psych 251 PS4: Simulation + Analysis + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+ +
+ +

This is problem set #4, in which we want you to integrate your knowledge of data wrangling with some basic simulation skills. It’s a short problem set to help consolidate your ggplot2 skills and then help you get your feet wet in testing statistical concepts through “making up data” rather than consulting a textbook or doing math.

+

For ease of reading, please separate your answers from our text by marking our text with the > character (indicating quotes).

+
+

Part 1: ggplot practice

+

This part is a warmup, it should be relatively straightforward ggplot2 practice.

+

Load data from Frank, Vul, Saxe (2011, Infancy), a study in which we measured infants’ looking to hands in moving scenes. There were infants from 3 months all the way to about two years, and there were two movie conditions (Faces_Medium, in which kids played on a white background, and Faces_Plus, in which the backgrounds were more complex and the people in the videos were both kids and adults). An eye-tracker measured children’s attention to faces. This version of the dataset only gives two conditions and only shows the amount of looking at hands (other variables were measured as well).

+
library(tidyverse)
+
fvs <- read_csv("data/FVS2011-hands.csv")
+
## Parsed with column specification:
+## cols(
+##   subid = col_double(),
+##   age = col_double(),
+##   condition = col_character(),
+##   hand.look = col_double()
+## )
+
colnames(fvs)
+
## [1] "subid"     "age"       "condition" "hand.look"
+
head(fvs)
+
## # A tibble: 6 x 4
+##   subid   age condition    hand.look
+##   <dbl> <dbl> <chr>            <dbl>
+## 1     2  3.16 Faces_Medium    0.0319
+## 2    93  5.03 Faces_Medium    0.119 
+## 3    29  5.85 Faces_Medium    0.0921
+## 4    76  5.85 Faces_Medium    0.130 
+## 5    48  6.08 Faces_Medium    0.0138
+## 6   101  6.15 Faces_Medium    0.0438
+

First, use ggplot to plot a histogram of the ages of children in the study. NOTE: this is a repeated measures design, so you can’t just take a histogram of every measurement.

+
# one row per child: collapse the repeated measurements before plotting ages
+fvs %>%
+    group_by(subid, age) %>%
+    summarise()
+
## # A tibble: 119 x 2
+## # Groups:   subid [119]
+##    subid   age
+##    <dbl> <dbl>
+##  1     1 12.0 
+##  2     2  3.16
+##  3     3  9.53
+##  4     4 15.4 
+##  5     5  9.86
+##  6     6  9.40
+##  7     7 13.6 
+##  8     8 13.3 
+##  9     9 12.5 
+## 10    10 14.5 
+## # ... with 109 more rows
+
ggplot(distinct(fvs, subid, age), aes(x = age)) + geom_histogram()
+
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+

+

Second, make a scatter plot showing hand looking as a function of age and condition. Add appropriate smoothing lines. Take the time to fix the axis labels and make the plot look nice.

+
ggplot(fvs, aes(x = age, y = hand.look, color = condition)) + 
+  geom_point(shape = ".") + 
+  geom_smooth() +
+  labs(x = "Age (months)", y = "Looking at hands")
+
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
+

+

What do you conclude from this pattern of data?

+
+

Around 15 months, infants start paying more attention to hands.

+
+

What statistical analyses would you perform here to quantify these differences?

+
+

One reasonable option would be a regression predicting hand looking from age, condition, and their interaction; since this is a repeated measures design, a mixed-effects model with a random effect for subject would be appropriate.

+
+
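A sketch of that mixed-model option (using lme4 is my assumption here; the problem set does not prescribe a package):

+
library(lme4)
+mod <- lmer(hand.look ~ age * condition + (1 | subid), data = fvs)
+summary(mod)
+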
+
+

Part 2: Simulation

+
library(tidyverse)
+

Let’s start by convincing ourselves that t-tests have the appropriate false positive rate. Run 10,000 t-tests with standard, normally-distributed data from a made up 30-person, single-measurement experiment (the command for sampling from a normal distribution is rnorm).

+

The goal of these t-tests is to determine, based on 30 observations, whether the underlying distribution (in this case a normal distribution with mean 0 and standard deviation 1) has a mean that is different from 0. In reality, the mean is not different from 0 (we sampled it using rnorm), but sometimes the 30 observations we get in our experiment will suggest that the mean is higher or lower. In this case, we’ll get a “significant” result and incorrectly reject the null hypothesis of mean 0.

+

What’s the proportion of “significant” results (\(p < .05\)) that you see?

+

First do this using a for loop.

+
result <- numeric(10000)
+
+for (i in 1:10000) {
+  X <- rnorm(30, 0, 1)
+  result[i] <- t.test(X)$p.value
+}
+
+sig_results <- result < 0.05
+
+sum(sig_results)/10000
+
## [1] 0.0501
+

Next, do this using the replicate function:

+
pvals <- replicate(10000, t.test(rnorm(30))$p.value)
+
+sig_pvals <- pvals < 0.05
+
+sum(sig_pvals)/10000
+
## [1] 0.0533
+

How does this compare to the intended false-positive rate of \(\alpha=0.05\)?

+
+

This answer makes sense. I’m getting a false positive rate of about 5 percent, which is what I would expect given that I am testing for significance at \(p < .05\).

+
+

Ok, that was a bit boring. Let’s try something more interesting - let’s implement a p-value sniffing simulation, in the style of Simons, Nelson, & Simonsohn (2011).

+

Consider this scenario: you have done an experiment, again with 30 participants (one observation each, just for simplicity). The question is whether the true mean is different from 0. You aren’t going to check the p-value every trial, but let’s say you run 30 - then if the p-value is within the range p < .25 and p > .05, you optionally run 30 more and add those data, then test again. But if the original p value is < .05, you call it a day, and if the original is > .25, you also stop.

+

First, write a function that implements this sampling regime.

+
double.sample <- function () {
+  
+  first_sample <- rnorm(30)
+
+    pvals_2 <- (t.test(first_sample)$p.value)
+
+  if(pvals_2<0.05) {
+      return(pvals_2)
+  } else if (pvals_2>0.25) {
+      return(pvals_2) 
+  } else {
+      second_sample <- c(first_sample, rnorm(30))
+      return(t.test(second_sample)$p.value)
+  }
+
+}
+

Now call this function 10k times and find out what happens.

+
tenk_results <- replicate(10000, double.sample())
+
+sum(tenk_results < 0.05)/10000
+
## [1] 0.0699
+

Is there an inflation of false positives? How bad is it?

+
+

Yes, there’s an inflation of false positives: I’m getting about 7 percent false positives using the double-sampling technique. This is the p-hacking Mike talked about in class. While I may believe I am keeping the false-positive rate at 0.05, the double-sampling rule is actually inflating it.

+
+

Now modify this code so that you can investigate this “double the sample” rule in a bit more depth. In the previous question, the researcher doubles the sample only when they think they got “close” to a significant result, i.e. when their not-significant p is less than 0.25. What if the researcher was more optimistic? See what happens in these 3 other scenarios:

+
* The researcher doubles the sample whenever their p-value is not significant, but it’s less than 0.5.
* The researcher doubles the sample whenever their p-value is not significant, but it’s less than 0.75.
* The researcher doubles their sample whenever they get ANY p-value that is not significant.

How do these choices affect the false positive rate?

+

HINT: Try to do this by making the function double.sample take the upper p value as an argument, so that you can pass this through dplyr.

+

HINT 2: You may need more samples. Find out by looking at how the results change from run to run.

+
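The three scenarios below can also be collapsed into one parameterized function plus a threshold sweep; a sketch (function and argument names are mine):

+
double.sample.upper <- function (upper) {
+  s <- rnorm(30)
+  p <- t.test(s)$p.value
+  if (p >= 0.05 & p < upper) {  # not significant but under the threshold: double and re-test
+    p <- t.test(c(s, rnorm(30)))$p.value
+  }
+  p
+}
+
+# false-positive rate per threshold; upper = 1 covers "whenever not significant"
+sapply(c(0.25, 0.5, 0.75, 1), function (u) {
+  mean(replicate(10000, double.sample.upper(u)) < 0.05)
+})
+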
# The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.5.
+
+double.sample0.5 <- function () {
+  
+  first_sample_0.05 <- rnorm(30)
+
+    pvals_2_0.05 <- (t.test(first_sample_0.05)$p.value)
+
+    if(pvals_2_0.05<0.05) {
+      return(pvals_2_0.05)
+  } else if (pvals_2_0.05>0.5) {
+      return(pvals_2_0.05) 
+  } else {
+      second_sample_0.05 <- c(first_sample_0.05, rnorm(30))
+      return(t.test(second_sample_0.05)$p.value)
+  }
+
+}
+
+tenk_results_0.05 <- replicate(10000, double.sample0.5())
+
+sum(tenk_results_0.05 < 0.05)/10000
+
## [1] 0.0815
+
# The researcher doubles the sample whenever their pvalue is not significant, but it's less than 0.75.
+
+double.sample0.75 <- function () {
+  
+  first_sample_0.75 <- rnorm(30)
+
+    pvals_2_0.75 <- (t.test(first_sample_0.75)$p.value)
+
+  
+  if(pvals_2_0.75<0.05) {
+      return(pvals_2_0.75)
+  } else if (pvals_2_0.75>0.75) {
+      return(pvals_2_0.75) 
+  } else {
+      second_sample_0.75 <- c(first_sample_0.75, rnorm(30))
+      return(t.test(second_sample_0.75)$p.value)
+  }
+
+}
+
+tenk_results_0.75 <- replicate(10000, double.sample0.75())
+
+sum(tenk_results_0.75 < 0.05)/10000
+
## [1] 0.0728
+
# The researcher doubles the sample whenever their pvalue is not significant.
+
+double.sample_whenever <- function () {
+  
+  first_sample_whenever <- rnorm(30)
+    pvals_2_whenever <- (t.test(first_sample_whenever)$p.value)
+
+  if(pvals_2_whenever<0.05) {
+      return(pvals_2_whenever)
+  } else {
+      second_sample_whenever <- c(first_sample_whenever, rnorm(30))
+      return(t.test(second_sample_whenever)$p.value)
+  }
+
+}
+
+tenk_results_whenever <- replicate(10000, double.sample_whenever())
+
+sum(tenk_results_whenever < 0.05)/10000
+
## [1] 0.0859
+

What do you conclude on the basis of this simulation? How bad is this kind of data-dependent policy?

+
+

I see an inflation of false positives: the more liberal my double-sampling policy, the higher the false-positive rate. For example, when I doubled the sample whenever the result was not significant, regardless of the p-value, I got a false-positive rate of about 8.6 percent. This data-dependent double-sampling policy leads to many more false positives, even though my nominal significance threshold is still 0.05.

+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/rsconnect/documents/Ceballos_ps4.Rmd/rpubs.com/rpubs/Document.dcf b/rsconnect/documents/Ceballos_ps4.Rmd/rpubs.com/rpubs/Document.dcf new file mode 100644 index 0000000..3fb0ddc --- /dev/null +++ b/rsconnect/documents/Ceballos_ps4.Rmd/rpubs.com/rpubs/Document.dcf @@ -0,0 +1,10 @@ +name: Document +title: +username: +account: rpubs +server: rpubs.com +hostUrl: rpubs.com +appId: https://api.rpubs.com/api/v1/document/549194/a2d7a0d184e842a1bc80a0b566751e7f +bundleId: https://api.rpubs.com/api/v1/document/549194/a2d7a0d184e842a1bc80a0b566751e7f +url: http://rpubs.com/publish/claim/549194/7334c10ee0ea4cf095682e9e5e13467c +when: 1573587960.91685