1. Think about an ongoing study in your lab (or a paper you have read in a different class), and decide on a pattern that you might expect in your experiment if a specific hypothesis were true.
I’m using a paper I read recently related to my research. To narrow the scope for the purpose of this assignment, I’ll focus on the effect of iron (Fe), phosphorus (P), and nitrogen (N) enrichment in a lake experiment on chlorophyll-a concentrations. North et al. 2007
2. To start simply, assume that the data in each of your treatment groups follow a normal distribution. Specify the sample sizes, means, and variances for each group that would be reasonable if your hypothesis were true. You may need to consult some previous literature and/or an expert in the field to come up with these numbers.
Control: n = 12; mean = ~1.2; variance = ~0.05
Fe: n = 12; mean = ~1.3; variance = ~0.05
Fe, P, N: n = 12; mean = ~5.7; variance = ~0.75
P, N: n = 12; mean = ~3.0; variance = ~0.3
3. Using the methods we have covered in class, write code to create a random data set that has these attributes. Organize these data into a data frame with the appropriate structure.
= 4
n_group = c("Control","Fe","Fe, P, N", "P, N")
n_name = c(12,12,12,12)
n_size = c(1.2,1.3,5.7,3.0)
n_mean = c(0.05,0.05,0.75,0.3)
n_sd
= 1:sum(n_size)
ID
= c(rnorm(n=n_size[1],mean=n_mean[1],sd=n_sd[1]),
res_var rnorm(n=n_size[2],mean=n_mean[2],sd=n_sd[2]),
rnorm(n=n_size[3],mean=n_mean[3],sd=n_sd[3]),
rnorm(n=n_size[4],mean=n_mean[4],sd=n_sd[4]))
= rep(n_name,n_size)
trt_group
= data.frame(ID,trt_group,res_var)
anova_data
head(anova_data)
## ID trt_group res_var
## 1 1 Control 1.103597
## 2 2 Control 1.196972
## 3 3 Control 1.239624
## 4 4 Control 1.169237
## 5 5 Control 1.199440
## 6 6 Control 1.178811
4. Now write code to analyze the data (probably as an ANOVA or regression analysis, but possibly as a logistic regression or contingency table analysis. Write code to generate a useful graph of the data.
= aov(res_var~trt_group,data=anova_data)
anova_model print(anova_model)
## Call:
## aov(formula = res_var ~ trt_group, data = anova_data)
##
## Terms:
## trt_group Residuals
## Sum of Squares 164.26628 7.16618
## Deg. of Freedom 3 44
##
## Residual standard error: 0.4035688
## Estimated effects may be unbalanced
= summary(anova_model)
z print(z)
## Df Sum Sq Mean Sq F value Pr(>F)
## trt_group 3 164.27 54.76 336.2 <2e-16 ***
## Residuals 44 7.17 0.16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
= unlist(z)
flat_out = list(f_ratio=unlist(z)[7],
anova_stats f_pval=unlist(z)[9])
print(anova_stats)
## $f_ratio
## F value1
## 336.1956
##
## $f_pval
## Pr(>F)1
## 2.445589e-30
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
= ggplot(anova_data) +
anova_plot aes(x=trt_group,y=res_var) +
geom_boxplot()
anova_plot
5. Try running your analysis multiple times to get a feeling for how variable the results are with the same parameters, but different sets of random numbers.
2nd analysis and plot:
= 4
n_group = c("Control","Fe","Fe, P, N", "P, N")
n_name = c(12,12,12,12)
n_size = c(1.2,1.3,5.7,3.0)
n_mean = c(0.05,0.05,0.75,0.3)
n_sd
= 1:sum(n_size)
ID
= c(rnorm(n=n_size[1],mean=n_mean[1],sd=n_sd[1]),
res_var rnorm(n=n_size[2],mean=n_mean[2],sd=n_sd[2]),
rnorm(n=n_size[3],mean=n_mean[3],sd=n_sd[3]),
rnorm(n=n_size[4],mean=n_mean[4],sd=n_sd[4]))
= rep(n_name,n_size)
trt_group
= data.frame(ID,trt_group,res_var)
anova_data
head(anova_data)
## ID trt_group res_var
## 1 1 Control 1.238174
## 2 2 Control 1.219169
## 3 3 Control 1.147382
## 4 4 Control 1.093499
## 5 5 Control 1.262232
## 6 6 Control 1.184556
= aov(res_var~trt_group,data=anova_data)
anova_model print(anova_model)
## Call:
## aov(formula = res_var ~ trt_group, data = anova_data)
##
## Terms:
## trt_group Residuals
## Sum of Squares 166.53545 2.30185
## Deg. of Freedom 3 44
##
## Residual standard error: 0.2287242
## Estimated effects may be unbalanced
= summary(anova_model)
z print(z)
## Df Sum Sq Mean Sq F value Pr(>F)
## trt_group 3 166.5 55.51 1061 <2e-16 ***
## Residuals 44 2.3 0.05
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
= unlist(z)
flat_out = list(f_ratio=unlist(z)[7],
anova_stats f_pval=unlist(z)[9])
print(anova_stats)
## $f_ratio
## F value1
## 1061.112
##
## $f_pval
## Pr(>F)1
## 4.892128e-41
= ggplot(anova_data) +
anova_plot aes(x=trt_group,y=res_var) +
geom_boxplot()
anova_plot
3rd analysis and plot:
= 4
n_group = c("Control","Fe","Fe, P, N", "P, N")
n_name = c(12,12,12,12)
n_size = c(1.2,1.3,5.7,3.0)
n_mean = c(0.05,0.05,0.75,0.3)
n_sd
= 1:sum(n_size)
ID
= c(rnorm(n=n_size[1],mean=n_mean[1],sd=n_sd[1]),
res_var rnorm(n=n_size[2],mean=n_mean[2],sd=n_sd[2]),
rnorm(n=n_size[3],mean=n_mean[3],sd=n_sd[3]),
rnorm(n=n_size[4],mean=n_mean[4],sd=n_sd[4]))
= rep(n_name,n_size)
trt_group
= data.frame(ID,trt_group,res_var)
anova_data
head(anova_data)
## ID trt_group res_var
## 1 1 Control 1.201413
## 2 2 Control 1.053553
## 3 3 Control 1.122351
## 4 4 Control 1.170083
## 5 5 Control 1.128019
## 6 6 Control 1.257643
= aov(res_var~trt_group,data=anova_data)
anova_model print(anova_model)
## Call:
## aov(formula = res_var ~ trt_group, data = anova_data)
##
## Terms:
## trt_group Residuals
## Sum of Squares 174.72514 7.45883
## Deg. of Freedom 3 44
##
## Residual standard error: 0.4117267
## Estimated effects may be unbalanced
= summary(anova_model)
z print(z)
## Df Sum Sq Mean Sq F value Pr(>F)
## trt_group 3 174.73 58.24 343.6 <2e-16 ***
## Residuals 44 7.46 0.17
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
= unlist(z)
flat_out = list(f_ratio=unlist(z)[7],
anova_stats f_pval=unlist(z)[9])
print(anova_stats)
## $f_ratio
## F value1
## 343.5706
##
## $f_pval
## Pr(>F)1
## 1.548166e-30
= ggplot(anova_data) +
anova_plot aes(x=trt_group,y=res_var) +
geom_boxplot()
anova_plot
6. Now begin adjusting the means of the different groups. Given the sample sizes you have chosen, how small can the differences between the groups be (the “effect size”) for you to still detect a significant pattern (p < 0.05)?
I ran it a couple of times, and I eventually found mean values which barely was not significant.
= 4
n_group = c("Control","Fe","Fe, P, N", "P, N")
n_name = c(12,12,12,12)
n_size = c(3.1,3.2,3.3,3.6)
n_mean = c(0.05,0.05,0.75,0.3)
n_sd
= 1:sum(n_size)
ID
= c(rnorm(n=n_size[1],mean=n_mean[1],sd=n_sd[1]),
res_var rnorm(n=n_size[2],mean=n_mean[2],sd=n_sd[2]),
rnorm(n=n_size[3],mean=n_mean[3],sd=n_sd[3]),
rnorm(n=n_size[4],mean=n_mean[4],sd=n_sd[4]))
= rep(n_name,n_size)
trt_group
= data.frame(ID,trt_group,res_var)
anova_data
head(anova_data)
## ID trt_group res_var
## 1 1 Control 3.169954
## 2 2 Control 3.106953
## 3 3 Control 3.040261
## 4 4 Control 3.025672
## 5 5 Control 3.038975
## 6 6 Control 3.132346
= aov(res_var~trt_group,data=anova_data)
anova_model print(anova_model)
## Call:
## aov(formula = res_var ~ trt_group, data = anova_data)
##
## Terms:
## trt_group Residuals
## Sum of Squares 2.120061 3.435392
## Deg. of Freedom 3 44
##
## Residual standard error: 0.2794228
## Estimated effects may be unbalanced
= summary(anova_model)
z print(z)
## Df Sum Sq Mean Sq F value Pr(>F)
## trt_group 3 2.120 0.7067 9.051 8.79e-05 ***
## Residuals 44 3.435 0.0781
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
= unlist(z)
flat_out = list(f_ratio=unlist(z)[7],
anova_stats f_pval=unlist(z)[9])
print(anova_stats)
## $f_ratio
## F value1
## 9.051145
##
## $f_pval
## Pr(>F)1
## 8.790254e-05
= ggplot(anova_data) +
anova_plot aes(x=trt_group,y=res_var) +
geom_boxplot()
anova_plot
7. Alternatively, for the effect sizes you originally hypothesized, what is the minimum sample size you would need in order to detect a statistically significant effect? Again, run the model a few times with the same parameter set to get a feeling for the effect of random variation in the data.
For some reason, it was statistically significant for a sample size of 2 and then obviously with a sample size of 1 it was N/A.
= 4
n_group = c("Control","Fe","Fe, P, N", "P, N")
n_name = c(1,1,1,1)
n_size = c(1.2,1.3,5.7,3.0)
n_mean = c(0.05,0.05,0.75,0.3)
n_sd
= 1:sum(n_size)
ID
= c(rnorm(n=n_size[1],mean=n_mean[1],sd=n_sd[1]),
res_var rnorm(n=n_size[2],mean=n_mean[2],sd=n_sd[2]),
rnorm(n=n_size[3],mean=n_mean[3],sd=n_sd[3]),
rnorm(n=n_size[4],mean=n_mean[4],sd=n_sd[4]))
= rep(n_name,n_size)
trt_group
= data.frame(ID,trt_group,res_var)
anova_data
head(anova_data)
## ID trt_group res_var
## 1 1 Control 1.279825
## 2 2 Fe 1.307285
## 3 3 Fe, P, N 6.224698
## 4 4 P, N 3.547986
= aov(res_var~trt_group,data=anova_data)
anova_model print(anova_model)
## Call:
## aov(formula = res_var ~ trt_group, data = anova_data)
##
## Terms:
## trt_group
## Sum of Squares 16.49089
## Deg. of Freedom 3
##
## Estimated effects may be unbalanced
= summary(anova_model)
z print(z)
## Df Sum Sq Mean Sq
## trt_group 3 16.49 5.497
= unlist(z)
flat_out = list(f_ratio=unlist(z)[7],
anova_stats f_pval=unlist(z)[9])
print(anova_stats)
## $f_ratio
## <NA>
## NA
##
## $f_pval
## <NA>
## NA
= ggplot(anova_data) +
anova_plot aes(x=trt_group,y=res_var) +
geom_boxplot()
anova_plot