Homework 6

Read a CSV and load packages

library(ggplot2)
library(MASS)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## v purrr   0.3.4

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks MASS::select()

z = read_csv("LTER_RandomLake.csv")

## Rows: 181 Columns: 30

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (30): wbic, secchi, depth_at_samplesite, top_hypolimnion, 440_color_10_c...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(z)

## spec_tbl_df [181 x 30] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ wbic                 : num [1:181] 2126 2283 2343 2392 2561 ...
##  $ secchi               : num [1:181] 0.58 0.6 1.85 0.65 2.88 3.25 1.45 1.45 1.45 1.45 ...
##  $ depth_at_samplesite  : num [1:181] 13.8 1.1 7 5.5 6.45 ...
##  $ top_hypolimnion      : num [1:181] 8 -999 5.5 4.5 -999 8.5 -999 -999 -999 -999 ...
##  $ 440_color_10_cm      : num [1:181] 0.0143 0.7261 0.1646 0.534 -0.0286 ...
##  $ 440_color_1_cm       : num [1:181] 0.0556 0.0648 0.0158 0.0524 0.0144 0.0083 0.0119 0.0119 0.0119 0.0119 ...
##  $ alk                  : num [1:181] 41.5 16.6 19.7 28.6 38.1 ...
##  $ pH                   : num [1:181] 5.46 5.12 5.8 5.55 5.98 ...
##  $ dic                  : num [1:181] 1.53 0.56 0.8 0.55 0.83 ...
##  $ doc                  : num [1:181] 17.04 18.38 9.58 16.07 8.9 ...
##  $ tic                  : num [1:181] 1.64 1.17 0.96 1.13 1.05 ...
##  $ toc                  : num [1:181] 16.9 17.91 9.39 16.3 9.14 ...
##  $ totnf                : num [1:181] 514 798 409 630 422 ...
##  $ totnuf               : num [1:181] 551 637 405 635 358 343 315 315 315 315 ...
##  $ totpf                : num [1:181] 15 13 10 13 15 4 24 24 24 24 ...
##  $ totpuf               : num [1:181] 10 15 4 18 5 2 10 10 10 10 ...
##  $ cl                   : num [1:181] 0.24 -999 1.22 0.23 0.24 0.17 0.81 0.81 0.81 0.81 ...
##  $ so4                  : num [1:181] 0.72 -999 2.47 2.65 2.57 3.09 1.92 1.92 1.92 1.92 ...
##  $ ca                   : num [1:181] 1.62 1.87 1.02 1.84 1.51 1.48 14.2 14.2 14.2 14.2 ...
##  $ mg                   : num [1:181] 0.455 0.535 0.339 0.609 0.508 0.478 3.52 3.52 3.52 3.52 ...
##  $ na                   : num [1:181] 0.499 0.986 0.165 0.362 0.51 0.442 1.55 1.55 1.55 1.55 ...
##  $ k                    : num [1:181] 0.838 0.616 0.427 0.504 0.717 0.485 0.74 0.74 0.74 0.74 ...
##  $ fe                   : num [1:181] 0.265 0.452 0.058 0.338 0.027 0.008 0.024 0.024 0.024 0.024 ...
##  $ mn                   : num [1:181] 0.031 0.056 0.013 0.058 0.016 0.006 0 0 0 0 ...
##  $ upstream_lake_area   : num [1:181] 17016 27724 1079506 56602 307679 ...
##  $ upstream_wetland_area: num [1:181] 191744 181517 5318068 160905 862452 ...
##  $ upstream_land_area   : num [1:181] 1186888 493318 9412654 606738 4529619 ...
##  $ upstream_total_area  : num [1:181] 1395648 702560 15810229 824245 5699750 ...
##  $ watershed_area       : num [1:181] 13954 216930 250420 344574 225998 ...
##  $ watershed_perimeter  : num [1:181] 634 2430 2430 3645 2483 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   wbic = col_double(),
##   ..   secchi = col_double(),
##   ..   depth_at_samplesite = col_double(),
##   ..   top_hypolimnion = col_double(),
##   ..   `440_color_10_cm` = col_double(),
##   ..   `440_color_1_cm` = col_double(),
##   ..   alk = col_double(),
##   ..   pH = col_double(),
##   ..   dic = col_double(),
##   ..   doc = col_double(),
##   ..   tic = col_double(),
##   ..   toc = col_double(),
##   ..   totnf = col_double(),
##   ..   totnuf = col_double(),
##   ..   totpf = col_double(),
##   ..   totpuf = col_double(),
##   ..   cl = col_double(),
##   ..   so4 = col_double(),
##   ..   ca = col_double(),
##   ..   mg = col_double(),
##   ..   na = col_double(),
##   ..   k = col_double(),
##   ..   fe = col_double(),
##   ..   mn = col_double(),
##   ..   upstream_lake_area = col_double(),
##   ..   upstream_wetland_area = col_double(),
##   ..   upstream_land_area = col_double(),
##   ..   upstream_total_area = col_double(),
##   ..   watershed_area = col_double(),
##   ..   watershed_perimeter = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

summary(z)

##       wbic             secchi         depth_at_samplesite top_hypolimnion 
##  Min.   :   2126   Min.   :-999.000   Min.   : 0.100      Min.   :-999.0  
##  1st Qu.: 981200   1st Qu.:   0.900   1st Qu.: 2.400      1st Qu.:-999.0  
##  Median :1172900   Median :   1.700   Median : 4.200      Median :-999.0  
##  Mean   :1308494   Mean   :  -3.517   Mean   : 4.935      Mean   :-715.8  
##  3rd Qu.:1880700   3rd Qu.:   2.900   3rd Qu.: 7.000      3rd Qu.:   4.0  
##  Max.   :2954700   Max.   :   7.550   Max.   :24.000      Max.   :  14.0  
##  440_color_10_cm   440_color_1_cm           alk                pH       
##  Min.   :-0.0297   Min.   :-999.0000   Min.   : -85.69   Min.   :4.093  
##  1st Qu.: 0.0513   1st Qu.:   0.0000   1st Qu.:  17.76   1st Qu.:5.549  
##  Median : 0.1082   Median :   0.0119   Median :  48.09   Median :6.286  
##  Mean   : 0.2276   Mean   : -11.0169   Mean   : 225.28   Mean   :6.386  
##  3rd Qu.: 0.2906   3rd Qu.:   0.0291   3rd Qu.: 320.71   3rd Qu.:7.343  
##  Max.   : 1.3268   Max.   :   0.1253   Max.   :1911.97   Max.   :9.062  
##       dic               doc                tic               toc         
##  Min.   :-999.00   Min.   :-999.000   Min.   :-999.00   Min.   :-999.00  
##  1st Qu.:   0.86   1st Qu.:   4.757   1st Qu.:   1.06   1st Qu.:   4.64  
##  Median :   1.53   Median :   7.320   Median :   1.73   Median :   7.25  
##  Mean   : -18.59   Mean   : -17.979   Mean   : -18.36   Mean   : -12.64  
##  3rd Qu.:   4.63   3rd Qu.:  11.940   3rd Qu.:   4.78   3rd Qu.:  11.65  
##  Max.   :  26.12   Max.   :  32.140   Max.   :  25.58   Max.   :  31.76  
##      totnf            totnuf           totpf              totpuf        
##  Min.   :-999.0   Min.   :-999.0   Min.   :-999.000   Min.   :-999.000  
##  1st Qu.: 359.0   1st Qu.: 348.0   1st Qu.:   5.000   1st Qu.:   5.000  
##  Median : 454.0   Median : 458.0   Median :   9.000   Median :  10.000  
##  Mean   : 518.5   Mean   : 551.7   Mean   :   3.439   Mean   :   8.102  
##  3rd Qu.: 630.0   3rd Qu.: 623.0   3rd Qu.:  16.000   3rd Qu.:  14.000  
##  Max.   :1783.0   Max.   :6112.0   Max.   : 153.500   Max.   : 180.000  
##        cl               so4                ca                 mg          
##  Min.   :-999.00   Min.   :-999.00   Min.   :-999.000   Min.   :-999.000  
##  1st Qu.:   0.19   1st Qu.:   0.65   1st Qu.:   0.875   1st Qu.:   0.265  
##  Median :   0.36   Median :   1.51   Median :   1.430   Median :   0.502  
##  Mean   : -31.88   Mean   : -31.60   Mean   : -34.808   Mean   : -37.482  
##  3rd Qu.:   0.82   3rd Qu.:   2.16   3rd Qu.:   5.410   3rd Qu.:   1.640  
##  Max.   :  38.95   Max.   :   5.96   Max.   :  27.500   Max.   :   7.050  
##        na                 k                  fe                 mn          
##  Min.   :-999.000   Min.   :-999.000   Min.   :-999.000   Min.   :-999.000  
##  1st Qu.:   0.177   1st Qu.:   0.317   1st Qu.:   0.024   1st Qu.:   0.003  
##  Median :   0.481   Median :   0.453   Median :   0.086   Median :   0.013  
##  Mean   : -37.428   Mean   : -38.125   Mean   : -38.406   Mean   : -38.606  
##  3rd Qu.:   1.290   3rd Qu.:   0.636   3rd Qu.:   0.246   3rd Qu.:   0.027  
##  Max.   :  40.550   Max.   :   2.120   Max.   :   3.010   Max.   :   1.290  
##  upstream_lake_area  upstream_wetland_area upstream_land_area 
##  Min.   :      700   Min.   :0.000e+00     Min.   :1.782e+03  
##  1st Qu.:   173277   1st Qu.:1.483e+05     1st Qu.:1.315e+06  
##  Median :  2145026   Median :2.228e+06     Median :1.472e+07  
##  Mean   : 70011586   Mean   :1.210e+08     Mean   :6.262e+08  
##  3rd Qu.: 37131534   3rd Qu.:3.064e+07     3rd Qu.:2.055e+08  
##  Max.   :628532381   Max.   :1.495e+09     Max.   :7.532e+09  
##  upstream_total_area watershed_area     watershed_perimeter
##  Min.   :3.488e+03   Min.   :     696   Min.   :  106      
##  1st Qu.:1.560e+06   1st Qu.:   52322   1st Qu.: 1215      
##  Median :2.341e+07   Median :  290857   Median : 3169      
##  Mean   :8.172e+08   Mean   : 1983465   Mean   : 6933      
##  3rd Qu.:2.665e+08   3rd Qu.: 1328783   3rd Qu.: 8188      
##  Max.   :9.655e+09   Max.   :45529548   Max.   :81133

Plot histogram of data

p1 = ggplot(data=z[which(z$pH>-900),], aes(x=pH, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Add empirical density curve

p1 =  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Get maximum likelihood parameters for normal

normPars = fitdistr(z$pH,"normal")
print(normPars)

##       mean          sd    
##   6.38559669   1.11650204 
##  (0.08298894) (0.05868204)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 6.39 1.12
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.083 0.0587
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.00689 0 0 0.00344
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 181
##  $ loglik  : num -277
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"]

##     mean 
## 6.385597

Plot normal probability density

meanML = normPars$estimate["mean"]
sdML = normPars$estimate["sd"]

xval = seq(0,max(z$pH),len=length(z$pH))

stat = stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$pH), args = list(mean = meanML, sd = sdML))
p1 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot exponential probability density

expoPars = fitdistr(z$pH,"exponential")
rateML = expoPars$estimate["rate"]

stat2 = stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$pH), args = list(rate=rateML))
p1 + stat + stat2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot uniform probability density

stat3 = stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$pH), args = list(min=min(z$pH), max=max(z$pH)))
p1 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot gamma probability density

gammaPars = fitdistr(z$pH,"gamma")
shapeML = gammaPars$estimate["shape"]
rateML = gammaPars$estimate["rate"]

stat4 = stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$pH), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot beta probability

pSpecial = ggplot(data=z, aes(x=pH/(max(pH + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars = fitdistr(x=z$pH/max(z$pH + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML = betaPars$estimate["shape1"]
shape2ML = betaPars$estimate["shape2"]

statSpecial = stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$pH), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

Find best-fitting distribution

meanML = normPars$estimate["mean"]
sdML = normPars$estimate["sd"]

normal = data.frame(normie=rnorm(181,mean=meanML,sd=sdML))
normal

##        normie
## 1    8.437762
## 2    6.845985
## 3    6.935974
## 4    6.383122
## 5    7.421067
## 6    5.803758
## 7    6.197692
## 8    6.992831
## 9    4.497462
## 10   5.644131
## 11   8.245458
## 12   7.217594
## 13   5.403369
## 14   6.818706
## 15   6.396978
## 16   8.276713
## 17   8.270491
## 18   6.093556
## 19   6.322151
## 20   5.960720
## 21   3.732854
## 22   6.760916
## 23   5.769743
## 24   6.610536
## 25   6.489066
## 26   6.616025
## 27   6.167008
## 28   7.122729
## 29   4.585888
## 30   7.397930
## 31   8.290792
## 32   7.544032
## 33   7.010933
## 34   7.873565
## 35   8.106808
## 36   6.300972
## 37   6.322779
## 38   6.429633
## 39   5.644394
## 40   5.782003
## 41   5.429558
## 42   6.734243
## 43   6.454128
## 44   7.794648
## 45   6.132556
## 46   7.856098
## 47   4.750609
## 48   4.992773
## 49   5.912492
## 50   5.729506
## 51   5.566996
## 52   6.099801
## 53   5.601709
## 54   6.308303
## 55   4.168093
## 56   5.004724
## 57   6.991096
## 58   5.194764
## 59   6.401970
## 60   6.267455
## 61   6.519993
## 62   6.791870
## 63   7.671385
## 64   6.720640
## 65   6.767115
## 66   8.335310
## 67   8.666459
## 68   6.902333
## 69   7.018635
## 70   6.011070
## 71   4.909746
## 72   7.103988
## 73   5.304220
## 74   5.613933
## 75   8.359440
## 76  10.006725
## 77   6.155627
## 78   8.443128
## 79   5.144662
## 80   5.256820
## 81   6.954932
## 82   6.529961
## 83   5.472276
## 84   6.936157
## 85   7.255758
## 86   6.005721
## 87   4.544853
## 88   4.566940
## 89   5.410495
## 90   7.447519
## 91   9.348727
## 92   7.641768
## 93   3.990012
## 94   5.414314
## 95   6.791514
## 96   6.721716
## 97   7.314892
## 98   6.312417
## 99   5.619853
## 100  7.196674
## 101  5.250536
## 102  6.069673
## 103  6.903618
## 104  6.228582
## 105  6.119214
## 106  7.165483
## 107  7.915653
## 108  5.345365
## 109  5.335240
## 110  7.260422
## 111  5.554882
## 112  7.554610
## 113  6.148681
## 114  4.792103
## 115  4.684168
## 116  4.457935
## 117  7.503283
## 118  7.323538
## 119  6.121903
## 120  5.661224
## 121  6.240589
## 122  7.094950
## 123  5.861891
## 124  6.621135
## 125  6.560552
## 126  6.956924
## 127  5.653400
## 128  5.981233
## 129  7.381003
## 130  7.035780
## 131  8.350422
## 132  7.198226
## 133  6.132095
## 134  5.917939
## 135  3.893001
## 136  6.400882
## 137  5.740555
## 138  7.284367
## 139  3.923884
## 140  6.127440
## 141  7.030811
## 142  6.904800
## 143  8.528292
## 144  6.276720
## 145  6.521648
## 146  7.630109
## 147  6.808329
## 148  7.882328
## 149  6.996096
## 150  6.307669
## 151  6.226382
## 152  7.548379
## 153  6.741492
## 154  5.521910
## 155  6.728021
## 156  6.943079
## 157  7.609840
## 158  5.131671
## 159  5.760477
## 160  7.656383
## 161  5.605202
## 162  6.529774
## 163  5.949328
## 164  7.291438
## 165  4.165952
## 166  5.628009
## 167  6.028572
## 168  6.630338
## 169  6.381907
## 170  6.096944
## 171  6.121994
## 172  5.739956
## 173  6.924218
## 174  5.914126
## 175  6.634634
## 176  6.023992
## 177  4.254434
## 178  9.028208
## 179  3.369887
## 180  6.386480
## 181  6.297567

normplot = ggplot(data=normal, aes(x=normie, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(normplot)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

normstat = stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(normal$normie), args = list(mean = meanML, sd = sdML))
normplot + normstat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

How do the two histogram profiles compare? Do you think the model is doing a good job of simulating realistic data that match your original measurements? Why or why not?

It actually does a pretty good job! pH must be pretty “normal” ;)