Homework 8

1. Think about an ongoing study in your lab (or a paper you have read in a different class), and decide on a pattern that you might expect in your experiment if a specific hypothesis were true.

I’m using a paper I read recently related to my research. To narrow the scope for the purpose of this assignment, I’ll focus on the effect of iron (Fe), phosphorus (P), and nitrogen (N) enrichment in a lake experiment on chlorophyll-a concentrations. North et al. 2007

2. To start simply, assume that the data in each of your treatment groups follow a normal distribution. Specify the sample sizes, means, and variances for each group that would be reasonable if your hypothesis were true. You may need to consult some previous literature and/or an expert in the field to come up with these numbers.

Control: n = 12; mean = ~1.2; variance = ~0.05

Fe: n = 12; mean = ~1.3; variance = ~0.05

Fe, P, N: n = 12; mean = ~5.7; variance = ~0.75

P, N: n = 12; mean = ~3.0; variance = ~0.3

3. Using the methods we have covered in class, write code to create a random data set that has these attributes. Organize these data into a data frame with the appropriate structure.

n_group = 4
n_name = c("Control","Fe","Fe, P, N", "P, N")
n_size = c(12,12,12,12)
n_mean = c(1.2,1.3,5.7,3.0)
n_sd = c(0.05,0.05,0.75,0.3)

ID = 1:sum(n_size)

res_var = c(rnorm(n=n_size[1],mean=n_mean[1],sd=n_sd[1]),
            rnorm(n=n_size[2],mean=n_mean[2],sd=n_sd[2]),
            rnorm(n=n_size[3],mean=n_mean[3],sd=n_sd[3]),
            rnorm(n=n_size[4],mean=n_mean[4],sd=n_sd[4]))

trt_group = rep(n_name,n_size)

anova_data = data.frame(ID,trt_group,res_var)

head(anova_data)

##   ID trt_group  res_var
## 1  1   Control 1.103597
## 2  2   Control 1.196972
## 3  3   Control 1.239624
## 4  4   Control 1.169237
## 5  5   Control 1.199440
## 6  6   Control 1.178811

4. Now write code to analyze the data (probably as an ANOVA or regression analysis, but possibly as a logistic regression or contingency table analysis. Write code to generate a useful graph of the data.

anova_model = aov(res_var~trt_group,data=anova_data)
print(anova_model)

## Call:
##    aov(formula = res_var ~ trt_group, data = anova_data)
## 
## Terms:
##                 trt_group Residuals
## Sum of Squares  164.26628   7.16618
## Deg. of Freedom         3        44
## 
## Residual standard error: 0.4035688
## Estimated effects may be unbalanced

z = summary(anova_model)
print(z)

##             Df Sum Sq Mean Sq F value Pr(>F)    
## trt_group    3 164.27   54.76   336.2 <2e-16 ***
## Residuals   44   7.17    0.16                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

flat_out = unlist(z)
anova_stats = list(f_ratio=unlist(z)[7],
                   f_pval=unlist(z)[9])

print(anova_stats)

## $f_ratio
## F value1 
## 336.1956 
## 
## $f_pval
##      Pr(>F)1 
## 2.445589e-30

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.2

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

anova_plot = ggplot(anova_data) +
  aes(x=trt_group,y=res_var) +
  geom_boxplot()
anova_plot

5. Try running your analysis multiple times to get a feeling for how variable the results are with the same parameters, but different sets of random numbers.

2nd analysis and plot:

n_group = 4
n_name = c("Control","Fe","Fe, P, N", "P, N")
n_size = c(12,12,12,12)
n_mean = c(1.2,1.3,5.7,3.0)
n_sd = c(0.05,0.05,0.75,0.3)

ID = 1:sum(n_size)

res_var = c(rnorm(n=n_size[1],mean=n_mean[1],sd=n_sd[1]),
            rnorm(n=n_size[2],mean=n_mean[2],sd=n_sd[2]),
            rnorm(n=n_size[3],mean=n_mean[3],sd=n_sd[3]),
            rnorm(n=n_size[4],mean=n_mean[4],sd=n_sd[4]))

trt_group = rep(n_name,n_size)

anova_data = data.frame(ID,trt_group,res_var)

head(anova_data)

##   ID trt_group  res_var
## 1  1   Control 1.238174
## 2  2   Control 1.219169
## 3  3   Control 1.147382
## 4  4   Control 1.093499
## 5  5   Control 1.262232
## 6  6   Control 1.184556

anova_model = aov(res_var~trt_group,data=anova_data)
print(anova_model)

## Call:
##    aov(formula = res_var ~ trt_group, data = anova_data)
## 
## Terms:
##                 trt_group Residuals
## Sum of Squares  166.53545   2.30185
## Deg. of Freedom         3        44
## 
## Residual standard error: 0.2287242
## Estimated effects may be unbalanced

z = summary(anova_model)
print(z)

##             Df Sum Sq Mean Sq F value Pr(>F)    
## trt_group    3  166.5   55.51    1061 <2e-16 ***
## Residuals   44    2.3    0.05                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

flat_out = unlist(z)
anova_stats = list(f_ratio=unlist(z)[7],
                   f_pval=unlist(z)[9])

print(anova_stats)

## $f_ratio
## F value1 
## 1061.112 
## 
## $f_pval
##      Pr(>F)1 
## 4.892128e-41

anova_plot = ggplot(anova_data) +
  aes(x=trt_group,y=res_var) +
  geom_boxplot()
anova_plot

3rd analysis and plot:

n_group = 4
n_name = c("Control","Fe","Fe, P, N", "P, N")
n_size = c(12,12,12,12)
n_mean = c(1.2,1.3,5.7,3.0)
n_sd = c(0.05,0.05,0.75,0.3)

ID = 1:sum(n_size)

res_var = c(rnorm(n=n_size[1],mean=n_mean[1],sd=n_sd[1]),
            rnorm(n=n_size[2],mean=n_mean[2],sd=n_sd[2]),
            rnorm(n=n_size[3],mean=n_mean[3],sd=n_sd[3]),
            rnorm(n=n_size[4],mean=n_mean[4],sd=n_sd[4]))

trt_group = rep(n_name,n_size)

anova_data = data.frame(ID,trt_group,res_var)

head(anova_data)

##   ID trt_group  res_var
## 1  1   Control 1.201413
## 2  2   Control 1.053553
## 3  3   Control 1.122351
## 4  4   Control 1.170083
## 5  5   Control 1.128019
## 6  6   Control 1.257643

anova_model = aov(res_var~trt_group,data=anova_data)
print(anova_model)

## Call:
##    aov(formula = res_var ~ trt_group, data = anova_data)
## 
## Terms:
##                 trt_group Residuals
## Sum of Squares  174.72514   7.45883
## Deg. of Freedom         3        44
## 
## Residual standard error: 0.4117267
## Estimated effects may be unbalanced

z = summary(anova_model)
print(z)

##             Df Sum Sq Mean Sq F value Pr(>F)    
## trt_group    3 174.73   58.24   343.6 <2e-16 ***
## Residuals   44   7.46    0.17                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

flat_out = unlist(z)
anova_stats = list(f_ratio=unlist(z)[7],
                   f_pval=unlist(z)[9])

print(anova_stats)

## $f_ratio
## F value1 
## 343.5706 
## 
## $f_pval
##      Pr(>F)1 
## 1.548166e-30

anova_plot = ggplot(anova_data) +
  aes(x=trt_group,y=res_var) +
  geom_boxplot()
anova_plot

6. Now begin adjusting the means of the different groups. Given the sample sizes you have chosen, how small can the differences between the groups be (the “effect size”) for you to still detect a significant pattern (p < 0.05)?

I ran it a couple of times, and I eventually found mean values which barely was not significant.

n_group = 4
n_name = c("Control","Fe","Fe, P, N", "P, N")
n_size = c(12,12,12,12)
n_mean = c(3.1,3.2,3.3,3.6)
n_sd = c(0.05,0.05,0.75,0.3)

ID = 1:sum(n_size)

res_var = c(rnorm(n=n_size[1],mean=n_mean[1],sd=n_sd[1]),
            rnorm(n=n_size[2],mean=n_mean[2],sd=n_sd[2]),
            rnorm(n=n_size[3],mean=n_mean[3],sd=n_sd[3]),
            rnorm(n=n_size[4],mean=n_mean[4],sd=n_sd[4]))

trt_group = rep(n_name,n_size)

anova_data = data.frame(ID,trt_group,res_var)

head(anova_data)

##   ID trt_group  res_var
## 1  1   Control 3.169954
## 2  2   Control 3.106953
## 3  3   Control 3.040261
## 4  4   Control 3.025672
## 5  5   Control 3.038975
## 6  6   Control 3.132346

anova_model = aov(res_var~trt_group,data=anova_data)
print(anova_model)

## Call:
##    aov(formula = res_var ~ trt_group, data = anova_data)
## 
## Terms:
##                 trt_group Residuals
## Sum of Squares   2.120061  3.435392
## Deg. of Freedom         3        44
## 
## Residual standard error: 0.2794228
## Estimated effects may be unbalanced

z = summary(anova_model)
print(z)

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## trt_group    3  2.120  0.7067   9.051 8.79e-05 ***
## Residuals   44  3.435  0.0781                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

flat_out = unlist(z)
anova_stats = list(f_ratio=unlist(z)[7],
                   f_pval=unlist(z)[9])

print(anova_stats)

## $f_ratio
## F value1 
## 9.051145 
## 
## $f_pval
##      Pr(>F)1 
## 8.790254e-05

anova_plot = ggplot(anova_data) +
  aes(x=trt_group,y=res_var) +
  geom_boxplot()
anova_plot

7. Alternatively, for the effect sizes you originally hypothesized, what is the minimum sample size you would need in order to detect a statistically significant effect? Again, run the model a few times with the same parameter set to get a feeling for the effect of random variation in the data.

For some reason, it was statistically significant for a sample size of 2 and then obviously with a sample size of 1 it was N/A.

n_group = 4
n_name = c("Control","Fe","Fe, P, N", "P, N")
n_size = c(1,1,1,1)
n_mean = c(1.2,1.3,5.7,3.0)
n_sd = c(0.05,0.05,0.75,0.3)

ID = 1:sum(n_size)

res_var = c(rnorm(n=n_size[1],mean=n_mean[1],sd=n_sd[1]),
            rnorm(n=n_size[2],mean=n_mean[2],sd=n_sd[2]),
            rnorm(n=n_size[3],mean=n_mean[3],sd=n_sd[3]),
            rnorm(n=n_size[4],mean=n_mean[4],sd=n_sd[4]))

trt_group = rep(n_name,n_size)

anova_data = data.frame(ID,trt_group,res_var)

head(anova_data)

##   ID trt_group  res_var
## 1  1   Control 1.279825
## 2  2        Fe 1.307285
## 3  3  Fe, P, N 6.224698
## 4  4      P, N 3.547986

anova_model = aov(res_var~trt_group,data=anova_data)
print(anova_model)

## Call:
##    aov(formula = res_var ~ trt_group, data = anova_data)
## 
## Terms:
##                 trt_group
## Sum of Squares   16.49089
## Deg. of Freedom         3
## 
## Estimated effects may be unbalanced

z = summary(anova_model)
print(z)

##             Df Sum Sq Mean Sq
## trt_group    3  16.49   5.497

flat_out = unlist(z)
anova_stats = list(f_ratio=unlist(z)[7],
                   f_pval=unlist(z)[9])

print(anova_stats)

## $f_ratio
## <NA> 
##   NA 
## 
## $f_pval
## <NA> 
##   NA

anova_plot = ggplot(anova_data) +
  aes(x=trt_group,y=res_var) +
  geom_boxplot()
anova_plot