Read a CSV and load packages
library(ggplot2)
library(MASS)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
= read_csv("LTER_RandomLake.csv") z
## Rows: 181 Columns: 30
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (30): wbic, secchi, depth_at_samplesite, top_hypolimnion, 440_color_10_c...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(z)
## spec_tbl_df [181 x 30] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ wbic : num [1:181] 2126 2283 2343 2392 2561 ...
## $ secchi : num [1:181] 0.58 0.6 1.85 0.65 2.88 3.25 1.45 1.45 1.45 1.45 ...
## $ depth_at_samplesite : num [1:181] 13.8 1.1 7 5.5 6.45 ...
## $ top_hypolimnion : num [1:181] 8 -999 5.5 4.5 -999 8.5 -999 -999 -999 -999 ...
## $ 440_color_10_cm : num [1:181] 0.0143 0.7261 0.1646 0.534 -0.0286 ...
## $ 440_color_1_cm : num [1:181] 0.0556 0.0648 0.0158 0.0524 0.0144 0.0083 0.0119 0.0119 0.0119 0.0119 ...
## $ alk : num [1:181] 41.5 16.6 19.7 28.6 38.1 ...
## $ pH : num [1:181] 5.46 5.12 5.8 5.55 5.98 ...
## $ dic : num [1:181] 1.53 0.56 0.8 0.55 0.83 ...
## $ doc : num [1:181] 17.04 18.38 9.58 16.07 8.9 ...
## $ tic : num [1:181] 1.64 1.17 0.96 1.13 1.05 ...
## $ toc : num [1:181] 16.9 17.91 9.39 16.3 9.14 ...
## $ totnf : num [1:181] 514 798 409 630 422 ...
## $ totnuf : num [1:181] 551 637 405 635 358 343 315 315 315 315 ...
## $ totpf : num [1:181] 15 13 10 13 15 4 24 24 24 24 ...
## $ totpuf : num [1:181] 10 15 4 18 5 2 10 10 10 10 ...
## $ cl : num [1:181] 0.24 -999 1.22 0.23 0.24 0.17 0.81 0.81 0.81 0.81 ...
## $ so4 : num [1:181] 0.72 -999 2.47 2.65 2.57 3.09 1.92 1.92 1.92 1.92 ...
## $ ca : num [1:181] 1.62 1.87 1.02 1.84 1.51 1.48 14.2 14.2 14.2 14.2 ...
## $ mg : num [1:181] 0.455 0.535 0.339 0.609 0.508 0.478 3.52 3.52 3.52 3.52 ...
## $ na : num [1:181] 0.499 0.986 0.165 0.362 0.51 0.442 1.55 1.55 1.55 1.55 ...
## $ k : num [1:181] 0.838 0.616 0.427 0.504 0.717 0.485 0.74 0.74 0.74 0.74 ...
## $ fe : num [1:181] 0.265 0.452 0.058 0.338 0.027 0.008 0.024 0.024 0.024 0.024 ...
## $ mn : num [1:181] 0.031 0.056 0.013 0.058 0.016 0.006 0 0 0 0 ...
## $ upstream_lake_area : num [1:181] 17016 27724 1079506 56602 307679 ...
## $ upstream_wetland_area: num [1:181] 191744 181517 5318068 160905 862452 ...
## $ upstream_land_area : num [1:181] 1186888 493318 9412654 606738 4529619 ...
## $ upstream_total_area : num [1:181] 1395648 702560 15810229 824245 5699750 ...
## $ watershed_area : num [1:181] 13954 216930 250420 344574 225998 ...
## $ watershed_perimeter : num [1:181] 634 2430 2430 3645 2483 ...
## - attr(*, "spec")=
## .. cols(
## .. wbic = col_double(),
## .. secchi = col_double(),
## .. depth_at_samplesite = col_double(),
## .. top_hypolimnion = col_double(),
## .. `440_color_10_cm` = col_double(),
## .. `440_color_1_cm` = col_double(),
## .. alk = col_double(),
## .. pH = col_double(),
## .. dic = col_double(),
## .. doc = col_double(),
## .. tic = col_double(),
## .. toc = col_double(),
## .. totnf = col_double(),
## .. totnuf = col_double(),
## .. totpf = col_double(),
## .. totpuf = col_double(),
## .. cl = col_double(),
## .. so4 = col_double(),
## .. ca = col_double(),
## .. mg = col_double(),
## .. na = col_double(),
## .. k = col_double(),
## .. fe = col_double(),
## .. mn = col_double(),
## .. upstream_lake_area = col_double(),
## .. upstream_wetland_area = col_double(),
## .. upstream_land_area = col_double(),
## .. upstream_total_area = col_double(),
## .. watershed_area = col_double(),
## .. watershed_perimeter = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(z)
## wbic secchi depth_at_samplesite top_hypolimnion
## Min. : 2126 Min. :-999.000 Min. : 0.100 Min. :-999.0
## 1st Qu.: 981200 1st Qu.: 0.900 1st Qu.: 2.400 1st Qu.:-999.0
## Median :1172900 Median : 1.700 Median : 4.200 Median :-999.0
## Mean :1308494 Mean : -3.517 Mean : 4.935 Mean :-715.8
## 3rd Qu.:1880700 3rd Qu.: 2.900 3rd Qu.: 7.000 3rd Qu.: 4.0
## Max. :2954700 Max. : 7.550 Max. :24.000 Max. : 14.0
## 440_color_10_cm 440_color_1_cm alk pH
## Min. :-0.0297 Min. :-999.0000 Min. : -85.69 Min. :4.093
## 1st Qu.: 0.0513 1st Qu.: 0.0000 1st Qu.: 17.76 1st Qu.:5.549
## Median : 0.1082 Median : 0.0119 Median : 48.09 Median :6.286
## Mean : 0.2276 Mean : -11.0169 Mean : 225.28 Mean :6.386
## 3rd Qu.: 0.2906 3rd Qu.: 0.0291 3rd Qu.: 320.71 3rd Qu.:7.343
## Max. : 1.3268 Max. : 0.1253 Max. :1911.97 Max. :9.062
## dic doc tic toc
## Min. :-999.00 Min. :-999.000 Min. :-999.00 Min. :-999.00
## 1st Qu.: 0.86 1st Qu.: 4.757 1st Qu.: 1.06 1st Qu.: 4.64
## Median : 1.53 Median : 7.320 Median : 1.73 Median : 7.25
## Mean : -18.59 Mean : -17.979 Mean : -18.36 Mean : -12.64
## 3rd Qu.: 4.63 3rd Qu.: 11.940 3rd Qu.: 4.78 3rd Qu.: 11.65
## Max. : 26.12 Max. : 32.140 Max. : 25.58 Max. : 31.76
## totnf totnuf totpf totpuf
## Min. :-999.0 Min. :-999.0 Min. :-999.000 Min. :-999.000
## 1st Qu.: 359.0 1st Qu.: 348.0 1st Qu.: 5.000 1st Qu.: 5.000
## Median : 454.0 Median : 458.0 Median : 9.000 Median : 10.000
## Mean : 518.5 Mean : 551.7 Mean : 3.439 Mean : 8.102
## 3rd Qu.: 630.0 3rd Qu.: 623.0 3rd Qu.: 16.000 3rd Qu.: 14.000
## Max. :1783.0 Max. :6112.0 Max. : 153.500 Max. : 180.000
## cl so4 ca mg
## Min. :-999.00 Min. :-999.00 Min. :-999.000 Min. :-999.000
## 1st Qu.: 0.19 1st Qu.: 0.65 1st Qu.: 0.875 1st Qu.: 0.265
## Median : 0.36 Median : 1.51 Median : 1.430 Median : 0.502
## Mean : -31.88 Mean : -31.60 Mean : -34.808 Mean : -37.482
## 3rd Qu.: 0.82 3rd Qu.: 2.16 3rd Qu.: 5.410 3rd Qu.: 1.640
## Max. : 38.95 Max. : 5.96 Max. : 27.500 Max. : 7.050
## na k fe mn
## Min. :-999.000 Min. :-999.000 Min. :-999.000 Min. :-999.000
## 1st Qu.: 0.177 1st Qu.: 0.317 1st Qu.: 0.024 1st Qu.: 0.003
## Median : 0.481 Median : 0.453 Median : 0.086 Median : 0.013
## Mean : -37.428 Mean : -38.125 Mean : -38.406 Mean : -38.606
## 3rd Qu.: 1.290 3rd Qu.: 0.636 3rd Qu.: 0.246 3rd Qu.: 0.027
## Max. : 40.550 Max. : 2.120 Max. : 3.010 Max. : 1.290
## upstream_lake_area upstream_wetland_area upstream_land_area
## Min. : 700 Min. :0.000e+00 Min. :1.782e+03
## 1st Qu.: 173277 1st Qu.:1.483e+05 1st Qu.:1.315e+06
## Median : 2145026 Median :2.228e+06 Median :1.472e+07
## Mean : 70011586 Mean :1.210e+08 Mean :6.262e+08
## 3rd Qu.: 37131534 3rd Qu.:3.064e+07 3rd Qu.:2.055e+08
## Max. :628532381 Max. :1.495e+09 Max. :7.532e+09
## upstream_total_area watershed_area watershed_perimeter
## Min. :3.488e+03 Min. : 696 Min. : 106
## 1st Qu.:1.560e+06 1st Qu.: 52322 1st Qu.: 1215
## Median :2.341e+07 Median : 290857 Median : 3169
## Mean :8.172e+08 Mean : 1983465 Mean : 6933
## 3rd Qu.:2.665e+08 3rd Qu.: 1328783 3rd Qu.: 8188
## Max. :9.655e+09 Max. :45529548 Max. :81133
Plot histogram of data
= ggplot(data=z[which(z$pH>-900),], aes(x=pH, y=..density..)) +
p1 geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Add empirical density curve
= p1 + geom_density(linetype="dotted",size=0.75)
p1 print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Get maximum likelihood parameters for normal
= fitdistr(z$pH,"normal")
normPars print(normPars)
## mean sd
## 6.38559669 1.11650204
## (0.08298894) (0.05868204)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 6.39 1.12
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.083 0.0587
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.00689 0 0 0.00344
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 181
## $ loglik : num -277
## - attr(*, "class")= chr "fitdistr"
$estimate["mean"] normPars
## mean
## 6.385597
Plot normal probability density
= normPars$estimate["mean"]
meanML = normPars$estimate["sd"]
sdML
= seq(0,max(z$pH),len=length(z$pH))
xval
= stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$pH), args = list(mean = meanML, sd = sdML))
stat + stat p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot exponential probability density
= fitdistr(z$pH,"exponential")
expoPars = expoPars$estimate["rate"]
rateML
= stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$pH), args = list(rate=rateML))
stat2 + stat + stat2 p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot uniform probability density
= stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$pH), args = list(min=min(z$pH), max=max(z$pH)))
stat3 + stat + stat2 + stat3 p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot gamma probability density
= fitdistr(z$pH,"gamma")
gammaPars = gammaPars$estimate["shape"]
shapeML = gammaPars$estimate["rate"]
rateML
= stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$pH), args = list(shape=shapeML, rate=rateML))
stat4 + stat + stat2 + stat3 + stat4 p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot beta probability
= ggplot(data=z, aes(x=pH/(max(pH + 0.1)), y=..density..)) +
pSpecial geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
= fitdistr(x=z$pH/max(z$pH + 0.1),start=list(shape1=1,shape2=2),"beta") betaPars
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
= betaPars$estimate["shape1"]
shape1ML = betaPars$estimate["shape2"]
shape2ML
= stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$pH), args = list(shape1=shape1ML,shape2=shape2ML))
statSpecial + statSpecial pSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
Find best-fitting distribution
= normPars$estimate["mean"]
meanML = normPars$estimate["sd"]
sdML
= data.frame(normie=rnorm(181,mean=meanML,sd=sdML))
normal normal
## normie
## 1 8.437762
## 2 6.845985
## 3 6.935974
## 4 6.383122
## 5 7.421067
## 6 5.803758
## 7 6.197692
## 8 6.992831
## 9 4.497462
## 10 5.644131
## 11 8.245458
## 12 7.217594
## 13 5.403369
## 14 6.818706
## 15 6.396978
## 16 8.276713
## 17 8.270491
## 18 6.093556
## 19 6.322151
## 20 5.960720
## 21 3.732854
## 22 6.760916
## 23 5.769743
## 24 6.610536
## 25 6.489066
## 26 6.616025
## 27 6.167008
## 28 7.122729
## 29 4.585888
## 30 7.397930
## 31 8.290792
## 32 7.544032
## 33 7.010933
## 34 7.873565
## 35 8.106808
## 36 6.300972
## 37 6.322779
## 38 6.429633
## 39 5.644394
## 40 5.782003
## 41 5.429558
## 42 6.734243
## 43 6.454128
## 44 7.794648
## 45 6.132556
## 46 7.856098
## 47 4.750609
## 48 4.992773
## 49 5.912492
## 50 5.729506
## 51 5.566996
## 52 6.099801
## 53 5.601709
## 54 6.308303
## 55 4.168093
## 56 5.004724
## 57 6.991096
## 58 5.194764
## 59 6.401970
## 60 6.267455
## 61 6.519993
## 62 6.791870
## 63 7.671385
## 64 6.720640
## 65 6.767115
## 66 8.335310
## 67 8.666459
## 68 6.902333
## 69 7.018635
## 70 6.011070
## 71 4.909746
## 72 7.103988
## 73 5.304220
## 74 5.613933
## 75 8.359440
## 76 10.006725
## 77 6.155627
## 78 8.443128
## 79 5.144662
## 80 5.256820
## 81 6.954932
## 82 6.529961
## 83 5.472276
## 84 6.936157
## 85 7.255758
## 86 6.005721
## 87 4.544853
## 88 4.566940
## 89 5.410495
## 90 7.447519
## 91 9.348727
## 92 7.641768
## 93 3.990012
## 94 5.414314
## 95 6.791514
## 96 6.721716
## 97 7.314892
## 98 6.312417
## 99 5.619853
## 100 7.196674
## 101 5.250536
## 102 6.069673
## 103 6.903618
## 104 6.228582
## 105 6.119214
## 106 7.165483
## 107 7.915653
## 108 5.345365
## 109 5.335240
## 110 7.260422
## 111 5.554882
## 112 7.554610
## 113 6.148681
## 114 4.792103
## 115 4.684168
## 116 4.457935
## 117 7.503283
## 118 7.323538
## 119 6.121903
## 120 5.661224
## 121 6.240589
## 122 7.094950
## 123 5.861891
## 124 6.621135
## 125 6.560552
## 126 6.956924
## 127 5.653400
## 128 5.981233
## 129 7.381003
## 130 7.035780
## 131 8.350422
## 132 7.198226
## 133 6.132095
## 134 5.917939
## 135 3.893001
## 136 6.400882
## 137 5.740555
## 138 7.284367
## 139 3.923884
## 140 6.127440
## 141 7.030811
## 142 6.904800
## 143 8.528292
## 144 6.276720
## 145 6.521648
## 146 7.630109
## 147 6.808329
## 148 7.882328
## 149 6.996096
## 150 6.307669
## 151 6.226382
## 152 7.548379
## 153 6.741492
## 154 5.521910
## 155 6.728021
## 156 6.943079
## 157 7.609840
## 158 5.131671
## 159 5.760477
## 160 7.656383
## 161 5.605202
## 162 6.529774
## 163 5.949328
## 164 7.291438
## 165 4.165952
## 166 5.628009
## 167 6.028572
## 168 6.630338
## 169 6.381907
## 170 6.096944
## 171 6.121994
## 172 5.739956
## 173 6.924218
## 174 5.914126
## 175 6.634634
## 176 6.023992
## 177 4.254434
## 178 9.028208
## 179 3.369887
## 180 6.386480
## 181 6.297567
= ggplot(data=normal, aes(x=normie, y=..density..)) +
normplot geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(normplot)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
= stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(normal$normie), args = list(mean = meanML, sd = sdML))
normstat + normstat normplot
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+ stat p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
How do the two histogram profiles compare? Do you think the model is doing a good job of simulating realistic data that match your original measurements? Why or why not?
It actually does a pretty good job! pH must be pretty “normal” ;)