Lab 4: LTER Network Data

ESS 330 - Quantitative Reasoning

Author

Introduction to the Data

# Install LTER Data Sampler
#remotes::install_github("lter/lterdatasampler")

# Install necessary libraries
library(tidyverse)

Warning: package 'lubridate' was built under R version 4.4.3

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggpubr)

Warning: package 'ggpubr' was built under R version 4.4.3

library(lterdatasampler)

Warning: package 'lterdatasampler' was built under R version 4.4.3

library(car)

Warning: package 'car' was built under R version 4.4.3

Loading required package: carData

Warning: package 'carData' was built under R version 4.4.3


Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

library(visdat)

Warning: package 'visdat' was built under R version 4.4.3

library(broom)
library(flextable)

Warning: package 'flextable' was built under R version 4.4.3


Attaching package: 'flextable'

The following objects are masked from 'package:ggpubr':

    border, font, rotate

The following object is masked from 'package:purrr':

    compose

# Retrieve the and_vertebrates data set
?and_vertebrates

starting httpd help server ... done

# Explore data structure
str(and_vertebrates)

tibble [32,209 × 16] (S3: tbl_df/tbl/data.frame)
 $ year       : num [1:32209] 1987 1987 1987 1987 1987 ...
 $ sitecode   : chr [1:32209] "MACKCC-L" "MACKCC-L" "MACKCC-L" "MACKCC-L" ...
 $ section    : chr [1:32209] "CC" "CC" "CC" "CC" ...
 $ reach      : chr [1:32209] "L" "L" "L" "L" ...
 $ pass       : num [1:32209] 1 1 1 1 1 1 1 1 1 1 ...
 $ unitnum    : num [1:32209] 1 1 1 1 1 1 1 1 1 1 ...
 $ unittype   : chr [1:32209] "R" "R" "R" "R" ...
 $ vert_index : num [1:32209] 1 2 3 4 5 6 7 8 9 10 ...
 $ pitnumber  : num [1:32209] NA NA NA NA NA NA NA NA NA NA ...
 $ species    : chr [1:32209] "Cutthroat trout" "Cutthroat trout" "Cutthroat trout" "Cutthroat trout" ...
 $ length_1_mm: num [1:32209] 58 61 89 58 93 86 107 131 103 117 ...
 $ length_2_mm: num [1:32209] NA NA NA NA NA NA NA NA NA NA ...
 $ weight_g   : num [1:32209] 1.75 1.95 5.6 2.15 6.9 5.9 10.5 20.6 9.55 13 ...
 $ clip       : chr [1:32209] "NONE" "NONE" "NONE" "NONE" ...
 $ sampledate : Date[1:32209], format: "1987-10-07" "1987-10-07" ...
 $ notes      : chr [1:32209] NA NA NA NA ...

and_vertebrates %>% 
  glimpse() %>% 
  vis_dat()

Rows: 32,209
Columns: 16
$ year        <dbl> 1987, 1987, 1987, 1987, 1987, 1987, 1987, 1987, 1987, 1987…
$ sitecode    <chr> "MACKCC-L", "MACKCC-L", "MACKCC-L", "MACKCC-L", "MACKCC-L"…
$ section     <chr> "CC", "CC", "CC", "CC", "CC", "CC", "CC", "CC", "CC", "CC"…
$ reach       <chr> "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L"…
$ pass        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ unitnum     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2…
$ unittype    <chr> "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R"…
$ vert_index  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, …
$ pitnumber   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ species     <chr> "Cutthroat trout", "Cutthroat trout", "Cutthroat trout", "…
$ length_1_mm <dbl> 58, 61, 89, 58, 93, 86, 107, 131, 103, 117, 100, 127, 99, …
$ length_2_mm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ weight_g    <dbl> 1.75, 1.95, 5.60, 2.15, 6.90, 5.90, 10.50, 20.60, 9.55, 13…
$ clip        <chr> "NONE", "NONE", "NONE", "NONE", "NONE", "NONE", "NONE", "N…
$ sampledate  <date> 1987-10-07, 1987-10-07, 1987-10-07, 1987-10-07, 1987-10-0…
$ notes       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

Part 1: Univariate and Bivariate Staistics

Guided Data Analysis Part 1

## Guided Data Analysis Part 1
and_vertebrates %>% 
  filter(species == "Cutthroat trout") %>% 
  drop_na(unittype) %>% 
  count(unittype)

# A tibble: 7 × 2
  unittype     n
  <chr>    <int>
1 C        11419
2 I           23
3 IP         105
4 P         5470
5 R          420
6 S            9
7 SC        2377

trout_clean <- and_vertebrates %>% 
  filter(species == "Cutthroat trout") %>%    # Filter for cutthroat
  filter(unittype %in% c("C", "P", "SC")) %>%   # Filter for the 3 most abundant unittypes
  drop_na(unittype, section)   # Drop NA values for unittype and section

# Save Cutthroat trout table
cont_table <- table(trout_clean$section, trout_clean$unittype)

# Conduct chi-squared test on the Cutthroat trout data
chisq.test(cont_table)


    Pearson's Chi-squared test

data:  cont_table
X-squared = 188.68, df = 2, p-value < 2.2e-16

# Plot the Cutthroat trout data

# Bar plot
trout_clean_barplot <- trout_clean %>% 
  count(unittype, section) %>% 
  ggbarplot(x = 'unittype', y = 'n',
            fill = 'section',
            palette = c("#00AFBB", "#E7B800"))

# Box Plot
trout_clean_boxplot <- trout_clean %>% 
  drop_na(weight_g) %>% 
  ggviolin(x = "section", y = "weight_g",
           add = "boxplot",
           color = "section",
           palette = c("#00AFBB", "#E7B800"))
  
# T-Test Assumptions
cc_weight <- trout_clean %>% 
  filter(section == "CC") %>% 
  pull(weight_g)

og_weight <- trout_clean %>%  
  filter(section == "OG") %>% 
  pull(weight_g)

var.test(cc_weight, og_weight)


    F test to compare two variances

data:  cc_weight and og_weight
F = 1.2889, num df = 6310, denom df = 5225, p-value < 2.2e-16
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 1.223686 1.357398
sample estimates:
ratio of variances 
          1.288892

# Plot Histograms
ggarrange(gghistogram(cc_weight, main = "Clear Cut"), gghistogram(og_weight, main = "Old Growth"))

Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

Warning: Removed 4273 rows containing non-finite outside the scale range
(`stat_bin()`).

Warning: Removed 3456 rows containing non-finite outside the scale range
(`stat_bin()`).

# Test log normalization
var.test(log(cc_weight), log(og_weight))


    F test to compare two variances

data:  log(cc_weight) and log(og_weight)
F = 1.0208, num df = 6310, denom df = 5225, p-value = 0.4374
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.9691443 1.0750427
sample estimates:
ratio of variances 
          1.020787

# Default t-test with log-normalized data
t.test(log(trout_clean$weight_g) ~ trout_clean$section, var.equal = TRUE)


    Two Sample t-test

data:  log(trout_clean$weight_g) by trout_clean$section
t = 2.854, df = 11535, p-value = 0.004324
alternative hypothesis: true difference in means between group CC and group OG is not equal to 0
95 percent confidence interval:
 0.02222425 0.11969560
sample estimates:
mean in group CC mean in group OG 
        1.457042         1.386082

# Welch Two Sample t-test
t.test(trout_clean$weight_g ~ trout_clean$section, var.equal = FALSE)


    Welch Two Sample t-test

data:  trout_clean$weight_g by trout_clean$section
t = 4.5265, df = 11491, p-value = 6.056e-06
alternative hypothesis: true difference in means between group CC and group OG is not equal to 0
95 percent confidence interval:
 0.4642016 1.1733126
sample estimates:
mean in group CC mean in group OG 
        8.988807         8.170050

# Coastal Giant Salamander
# Filter for salamander data
sally_clean <- and_vertebrates %>% 
  filter(species == "Coastal giant salamander") %>% 
  drop_na(length_2_mm, weight_g)   # Remove NA values from the data.

# Display salamander histograms
ggarrange(gghistogram(sally_clean$length_2_mm, title = "Length"),
gghistogram(sally_clean$weight_g, title = "Weight"))

Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

# Test for normal distribution
s <- sally_clean  %>%  
  slice_sample(n = 5000) 

shapiro.test(s$length_2_mm)


    Shapiro-Wilk normality test

data:  s$length_2_mm
W = 0.93061, p-value < 2.2e-16

shapiro.test(s$weight_g)


    Shapiro-Wilk normality test

data:  s$weight_g
W = 0.55188, p-value < 2.2e-16

# Display log-normalized Salamander histograms
ggarrange(
 gghistogram(log(sally_clean$length_2_mm), title = "Length"), 
 gghistogram(log(sally_clean$weight_g), title = "Weight") )

Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

# Correlation Test for Salamander Data
cor.test(log(sally_clean$length_2_mm), log(sally_clean$weight_g))


    Pearson's product-moment correlation

data:  log(sally_clean$length_2_mm) and log(sally_clean$weight_g)
t = 402.85, df = 6229, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9804036 0.9822403
sample estimates:
      cor 
0.9813443

# Visualized correlation for Salamander Data
sally_clean %>%
  mutate(log_length = log(length_2_mm), log_weight = log(weight_g)) %>% 
  ggscatter(x = 'log_length',
            y = 'log_weight',
            alpha = .35,
            add = "loess")

# Spearman Correlation Test for Salamander Data
cor.test(sally_clean$length_2_mm, sally_clean$weight_g, method = "spearman")

Warning in cor.test.default(sally_clean$length_2_mm, sally_clean$weight_g, :
Cannot compute exact p-value with ties


    Spearman's rank correlation rho

data:  sally_clean$length_2_mm and sally_clean$weight_g
S = 819296957, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.9796802

Part 1 Lab Exercise

## Question 1-1: Conduct a chi-square test similar to the one carried out above, but test for a relationship between forest type (`section`) and channel unit (`unittype`) for Coastal giant salamander abundance.

# Filter and clean salamander data
sal_clean <- and_vertebrates %>% 
  filter(species == "Coastal giant salamander") %>%    # Filter for salamanders
  drop_na(unittype, section)   # Drop NA values for unittype and section

# Save Salamander contingency table
sal_cont_table <- table(sal_clean$section, sal_clean$unittype)


# Chi-Sq Test for Section and Unittype
sal_chisq_result <- chisq.test(sal_cont_table)

Warning in chisq.test(sal_cont_table): Chi-squared approximation may be
incorrect

# Box Plot for Chi-sqaure test
sal_box_plot <- sal_clean %>% 
  count(unittype, section) %>% 
  ggpubr::ggbarplot(x = 'unittype', y = 'n',
                    fill = 'section',
                    palette = c("darkblue", "darkorange"),
                    add = "mean_se")

Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced

Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced
Warning in stats::qt(ci/2 + 0.5, data_sum$length - 1): NaNs produced

#ggsave("sal_chisq_test.png", plot = sal_box_plot, width = 10, height = 6)

# Print Chi-sq results
print(sal_chisq_result)


    Pearson's Chi-squared test

data:  sal_cont_table
X-squared = 200.71, df = 5, p-value < 2.2e-16

print(sal_box_plot)

## Question 1-2: Test the hypothesis that there is a significant difference in species biomass between clear cut and old growth forest types for the Coastal Giant salamander.


# Clean theadditional data
sal_clean <- sal_clean %>% 
  drop_na(weight_g)

# Equal Variance Test
sal_cc_weight <- sal_clean %>% 
  filter(section == "CC") %>% 
  pull(weight_g)

sal_og_weight <- sal_clean %>%  
  filter(section == "OG") %>% 
  pull(weight_g)

var.test(sal_cc_weight, sal_og_weight)


    F test to compare two variances

data:  sal_cc_weight and sal_og_weight
F = 0.82901, num df = 3027, denom df = 3309, p-value = 1.439e-07
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.7732148 0.8889213
sample estimates:
ratio of variances 
         0.8290065

# Test for normal distribution
sal <- sal_clean %>%  
  slice_sample(n = 5000) 

# Visualizing data normality
ggarrange(gghistogram(sal_cc_weight, main = "Clear Cut"),
          ggpubr::gghistogram(sal_og_weight, main = "Old Growth"))

Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

# Test log normalization
var.test(log(sal_cc_weight), log(sal_og_weight))


    F test to compare two variances

data:  log(sal_cc_weight) and log(sal_og_weight)
F = 0.90548, num df = 3027, denom df = 3309, p-value = 0.005299
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.8445382 0.9709179
sample estimates:
ratio of variances 
         0.9054763

# Welch Two Sample t-test
t.test(sal_clean$weight_g ~ sal_clean$section, var.equal = FALSE)


    Welch Two Sample t-test

data:  sal_clean$weight_g by sal_clean$section
t = 4.9255, df = 6335.9, p-value = 8.629e-07
alternative hypothesis: true difference in means between group CC and group OG is not equal to 0
95 percent confidence interval:
 0.8978633 2.0850725
sample estimates:
mean in group CC mean in group OG 
        9.810634         8.319166

## Question 1-3: Test the correlation between body length (snout to fork length) and body mass for Cutthroat trout. 

# Find the length variable (?and_vertebrates conducted in Chunk #1).
  # Variable is length_1_mm

# Clean the data
trout_clean <- and_vertebrates %>% 
  filter(species == "Cutthroat trout") %>% 
  drop_na(length_1_mm, weight_g)

# Look at distribution of variables
ggarrange(gghistogram(trout_clean$length_1_mm, title = "Length"),
          gghistogram(trout_clean$weight_g, title = "Weight"))

Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

# Shapiro-Wilk normality test only runs for 5000 observations 
trout_sample <- trout_clean %>% 
  slice_sample(n = 5000)   # Adjust data frame to only contain 5000 obs.

# Run S-W Norm Test for both vars
shapiro.test(trout_sample$length_1_mm)


    Shapiro-Wilk normality test

data:  trout_sample$length_1_mm
W = 0.94572, p-value < 2.2e-16

shapiro.test(trout_sample$weight_g)


    Shapiro-Wilk normality test

data:  trout_sample$weight_g
W = 0.81028, p-value < 2.2e-16

# Log transform vizualization
ggarrange(
  gghistogram(log(trout_clean$length_1_mm), title = "Length"),
  gghistogram(log(trout_clean$weight_g), title = "Weight")
)

Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

# Spearman Correlation Test
cor.test(trout_clean$length_1_mm, trout_clean$weight_g, method = "spearman")

Warning in cor.test.default(trout_clean$length_1_mm, trout_clean$weight_g, :
Cannot compute exact p-value with ties


    Spearman's rank correlation rho

data:  trout_clean$length_1_mm and trout_clean$weight_g
S = 2669679446, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.9919772

Question 1-1:

The chi-squared = 200.71, df = 5, p-value is less than 0.05 (P < 2.2e-16), which means that there is a statistically significant relationship between forest type and channel unit in regards to coastal giant salamander abundance. This supports rejecting the null hypothesis.

Question 1-2:

The results of the variance test suggest that the variences are not equal. The p-value is less than 0.05 (p < 1.439e-07). This supports rejecting the null hypothesis that the variances are equal. Visualizing the normality of the data, it appears right skewed, suggesting that a log transformation should be used to normalize the data or that a Welsch t-test should be conducted. Testing log normalization returned a p-value less than 0.05 (p < 5.299e-03) meaning even after adjustment we must reject the null hypothesis that the data is normally distrubuted. Given that the log normalization didn’ work a Welsh t-test is required. The p-value for the Welch two sample t-test is less than 0.05 (p < 8.629e-07). This p-value supports rejecting the null hypothesis as the coastal giant salamander did not have the same weight in the two forest sections.

Question 1-3:

Both variables appear skewed (not normally distributed). Shapiro-Wilk normality test used to confirm. The p-value is less than 0.05 for both (p < 2.2e-16), so we reject the null hypothesis, meaning that our data does not fit a normal distribution. The data can either be log-normalized or use the Pearson’s correlation test or we can use the Spearman correlation test. The data appears to follow a non-normal distribution, requiring the Spearman correlation test. The p-value is less than 0.05 (p < 2.2e-16), we reject the null. This supports a significant, positive relationship between length and weight for the Cutthroat trout. There is also a very high correlation coefficient.

Part 2: Multivariate Statistics

Guided Data Analysis Part 2

# Data set
data("pie_crab")

# Explore the data
glimpse(pie_crab)

Rows: 392
Columns: 9
$ date          <date> 2016-07-24, 2016-07-24, 2016-07-24, 2016-07-24, 2016-07…
$ latitude      <dbl> 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, …
$ site          <chr> "GTM", "GTM", "GTM", "GTM", "GTM", "GTM", "GTM", "GTM", …
$ size          <dbl> 12.43, 14.18, 14.52, 12.94, 12.45, 12.99, 10.32, 11.19, …
$ air_temp      <dbl> 21.792, 21.792, 21.792, 21.792, 21.792, 21.792, 21.792, …
$ air_temp_sd   <dbl> 6.391, 6.391, 6.391, 6.391, 6.391, 6.391, 6.391, 6.391, …
$ water_temp    <dbl> 24.502, 24.502, 24.502, 24.502, 24.502, 24.502, 24.502, …
$ water_temp_sd <dbl> 6.121, 6.121, 6.121, 6.121, 6.121, 6.121, 6.121, 6.121, …
$ name          <chr> "Guana Tolomoto Matanzas NERR", "Guana Tolomoto Matanzas…

vis_dat(pie_crab)

?pie_crab

# sample size per site
count(pie_crab, site)

# A tibble: 13 × 2
   site      n
   <chr> <int>
 1 BC       37
 2 CC       27
 3 CT       33
 4 DB       30
 5 GTM      28
 6 JC       30
 7 NB       29
 8 NIB      30
 9 PIE      28
10 RC       25
11 SI       30
12 VCR      30
13 ZI       35

summary(pie_crab)

      date               latitude         site                size      
 Min.   :2016-07-24   Min.   :30.00   Length:392         Min.   : 6.64  
 1st Qu.:2016-07-28   1st Qu.:34.00   Class :character   1st Qu.:12.02  
 Median :2016-08-01   Median :39.10   Mode  :character   Median :14.44  
 Mean   :2016-08-02   Mean   :37.69                      Mean   :14.66  
 3rd Qu.:2016-08-09   3rd Qu.:41.60                      3rd Qu.:17.34  
 Max.   :2016-08-13   Max.   :42.70                      Max.   :23.43  
    air_temp      air_temp_sd      water_temp    water_temp_sd  
 Min.   :10.29   Min.   :6.391   Min.   :13.98   Min.   :4.838  
 1st Qu.:12.05   1st Qu.:8.110   1st Qu.:14.33   1st Qu.:6.567  
 Median :13.93   Median :8.410   Median :17.50   Median :6.998  
 Mean   :15.20   Mean   :8.654   Mean   :17.65   Mean   :7.252  
 3rd Qu.:18.63   3rd Qu.:9.483   3rd Qu.:20.54   3rd Qu.:7.865  
 Max.   :21.79   Max.   :9.965   Max.   :24.50   Max.   :9.121  
     name          
 Length:392        
 Class :character  
 Mode  :character

# ANOVA
pie_crab |> 
  ggboxplot(x = 'site', y = 'size', col = 'site') +
  geom_jitter(size =.25) + 
  theme(legend.postition = "none")

Warning in plot_theme(plot): The `legend.postition` theme element is not
defined in the element hierarchy.

# Assumptions 
# Normality
norms <- pie_crab |> 
  nest(data = -site) |>
  mutate(Shapiro = map(data, ~ shapiro.test(.x$size)),
         n = map_dbl(data, nrow),
         glance_shapiro = map(Shapiro, broom::glance)) |>
  unnest(glance_shapiro)

flextable::flextable(dplyr::select(norms, site, n, statistic, p.value)) |>
  flextable::set_caption("Shapiro-Wilk normality test for size at each site")

site	n	statistic	p.value
GTM	28	0.9007814	0.0119337484
SI	30	0.9705352	0.5539208550
NIB	30	0.9728297	0.6191340731
ZI	35	0.9744583	0.5765589900
RC	25	0.9315062	0.0941588802
VCR	30	0.9444682	0.1200262239
DB	30	0.9576271	0.2690631942
JC	30	0.9634754	0.3788327942
CT	33	0.9277365	0.0301785639
NB	29	0.9675367	0.4949587443
CC	27	0.9354659	0.0941803007
BC	37	0.8885721	0.0014402753
PIE	28	0.8489399	0.0008899392

# Residuals
(res_aov <- aov(size ~ site, data = pie_crab))

Call:
   aov(formula = size ~ site, data = pie_crab)

Terms:
                    site Residuals
Sum of Squares  2172.376  2626.421
Deg. of Freedom       12       379

Residual standard error: 2.632465
Estimated effects may be unbalanced

gghistogram(res_aov$residuals)

Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

shapiro.test(res_aov$residuals)


    Shapiro-Wilk normality test

data:  res_aov$residuals
W = 0.99708, p-value = 0.7122

# Equal Variances
leveneTest(size ~ site, data = pie_crab)

Warning in leveneTest.default(y = y, group = group, ...): group coerced to
factor.

Levene's Test for Homogeneity of Variance (center = median)
       Df F value    Pr(>F)    
group  12  9.2268 1.151e-15 ***
      379                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# ANOVA Tests
# Welch's ANOVA
oneway.test(size ~ site, data = pie_crab, var.equal = FALSE)


    One-way analysis of means (not assuming equal variances)

data:  size and site
F = 39.108, num df = 12.00, denom df = 145.79, p-value < 2.2e-16

# Filter a subset of the sites
pie_sites <- pie_crab |> 
  filter(site %in% c("GTM", "DB", "PIE"))

# Check for equal variance
leveneTest(size ~ site, data = pie_sites)

Warning in leveneTest.default(y = y, group = group, ...): group coerced to
factor.

Levene's Test for Homogeneity of Variance (center = median)
      Df F value Pr(>F)
group  2   0.548 0.5802
      83

# Note that the variances are equal (p = 0.5802), so we can proceed with the ANOVA

# ANOVA for the data subset
pie_anova <- aov(size ~ site, data = pie_sites)

# View the ANOVA results 
summary(pie_anova)

            Df Sum Sq Mean Sq F value Pr(>F)    
site         2  521.5  260.75   60.02 <2e-16 ***
Residuals   83  360.6    4.34                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Post-hoc Tukey's HSD Test
TukeyHSD(pie_anova)

  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = size ~ site, data = pie_sites)

$site
             diff       lwr       upr   p adj
GTM-DB  -3.200786 -4.507850 -1.893722 3.0e-07
PIE-DB   2.899929  1.592865  4.206992 2.9e-06
PIE-GTM  6.100714  4.771306  7.430123 0.0e+00

# Linear Regression
pie_lm <- lm(size ~ latitude, data = pie_crab)

# View the results of the linear model
summary(pie_lm)


Call:
lm(formula = size ~ latitude, data = pie_crab)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.8376 -1.8797  0.1144  1.9484  6.9280 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.62442    1.27405  -2.845  0.00468 ** 
latitude     0.48512    0.03359  14.441  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.832 on 390 degrees of freedom
Multiple R-squared:  0.3484,    Adjusted R-squared:  0.3467 
F-statistic: 208.5 on 1 and 390 DF,  p-value: < 2.2e-16

# Linear regression visualized
pie_crab |> 
  ggscatter(x = 'latitude', y = 'size', 
            alpha = .35, 
            add = "reg.line")

# Predictions from linear regression
new_lat <- data.frame(latitude = c(32, 36, 38))
broom::augment(pie_lm, newdata = new_lat)

# A tibble: 3 × 2
  latitude .fitted
     <dbl>   <dbl>
1       32    11.9
2       36    13.8
3       38    14.8

# Multiple linear regression
pie_mlm <- lm(size ~ latitude + air_temp + water_temp, data = pie_crab)

summary(pie_mlm)


Call:
lm(formula = size ~ latitude + air_temp + water_temp, data = pie_crab)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.7099 -1.7195 -0.0602  1.7823  7.7271 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  77.7460    17.3477   4.482 9.76e-06 ***
latitude     -1.0587     0.3174  -3.336 0.000933 ***
air_temp     -2.4041     0.3844  -6.255 1.05e-09 ***
water_temp    0.7563     0.1465   5.162 3.92e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.677 on 388 degrees of freedom
Multiple R-squared:  0.4206,    Adjusted R-squared:  0.4161 
F-statistic:  93.9 on 3 and 388 DF,  p-value: < 2.2e-16

# Test mlm for correlations
pie_crab |> 
  select(latitude, air_temp, water_temp) |> 
  cor()

             latitude   air_temp water_temp
latitude    1.0000000 -0.9949715 -0.9571738
air_temp   -0.9949715  1.0000000  0.9632287
water_temp -0.9571738  0.9632287  1.0000000

Part 2 Lab Exercise

## Prework for Question 2-1
pie_crab %>% 
  ggboxplot(x = 'site', y = 'size', col = 'site') +
  geom_jitter(size =.25) +
  theme(legend.position = "none")

# Shapiro-Wilk Normality Test
norms <- pie_crab %>%  
  nest(data = -site) %>% 
  mutate(Shapiro = map(data, ~ shapiro.test(.x$size)),
         n = map_dbl(data, nrow),
         glance_shapiro = map(Shapiro, broom::glance)) %>%
  unnest(glance_shapiro)

flextable(select(norms, site, n, statistic, p.value)) %>% 
  set_caption("Shapiro-Wilk Normality Test for Size at Each Site")

site	n	statistic	p.value
GTM	28	0.9007814	0.0119337484
SI	30	0.9705352	0.5539208550
NIB	30	0.9728297	0.6191340731
ZI	35	0.9744583	0.5765589900
RC	25	0.9315062	0.0941588802
VCR	30	0.9444682	0.1200262239
DB	30	0.9576271	0.2690631942
JC	30	0.9634754	0.3788327942
CT	33	0.9277365	0.0301785639
NB	29	0.9675367	0.4949587443
CC	27	0.9354659	0.0941803007
BC	37	0.8885721	0.0014402753
PIE	28	0.8489399	0.0008899392

# Normal distribution check
(res_aov <- aov(size ~ site, data = pie_crab))

Call:
   aov(formula = size ~ site, data = pie_crab)

Terms:
                    site Residuals
Sum of Squares  2172.376  2626.421
Deg. of Freedom       12       379

Residual standard error: 2.632465
Estimated effects may be unbalanced

gghistogram(res_aov$residuals)

Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

shapiro.test(res_aov$residuals)


    Shapiro-Wilk normality test

data:  res_aov$residuals
W = 0.99708, p-value = 0.7122

# Levene's Test
leveneTest(size ~ site, data = pie_crab)

Warning in leveneTest.default(y = y, group = group, ...): group coerced to
factor.

Levene's Test for Homogeneity of Variance (center = median)
       Df F value    Pr(>F)    
group  12  9.2268 1.151e-15 ***
      379                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Welch's ANOVA
oneway.test(size ~ site, data = pie_crab, var.equal = FALSE)


    One-way analysis of means (not assuming equal variances)

data:  size and site
F = 39.108, num df = 12.00, denom df = 145.79, p-value < 2.2e-16

# Filter a subset of the sites
pie_sites <- pie_crab |> 
  filter(site %in% c("GTM", "DB", "PIE"))

# Check for equal variance
leveneTest(size ~ site, data = pie_sites)

Warning in leveneTest.default(y = y, group = group, ...): group coerced to
factor.

Levene's Test for Homogeneity of Variance (center = median)
      Df F value Pr(>F)
group  2   0.548 0.5802
      83

# Note that the variances are equal (p = 0.5802), so we can proceed with the ANOVA

# ANOVA for the data subset
pie_anova <- aov(size ~ site, data = pie_sites)

# View the ANOVA results 
summary(pie_anova)

            Df Sum Sq Mean Sq F value Pr(>F)    
site         2  521.5  260.75   60.02 <2e-16 ***
Residuals   83  360.6    4.34                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Post-hoc Tukey's HSD Test
TukeyHSD(pie_anova)

  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = size ~ site, data = pie_sites)

$site
             diff       lwr       upr   p adj
GTM-DB  -3.200786 -4.507850 -1.893722 3.0e-07
PIE-DB   2.899929  1.592865  4.206992 2.9e-06
PIE-GTM  6.100714  4.771306  7.430123 0.0e+00

## Question 2-1
# Organize by latitude
pie_crab <- pie_crab %>% 
  arrange(latitude) %>% 
  mutate(site = factor(site, levels = unique(site)))

# Plot data
box_plot_crab_sites <- ggplot(pie_crab, aes(x = site, y = size)) +
  geom_boxplot(fill = "orange", color = "black") +
  geom_jitter(size =.25) +
  labs(title = "Carapace Width by Site, Ordered by Latitude",
       x = "Site",
       y = "Carapace Width (mm)") +
  theme_minimal()   
print(box_plot_crab_sites)

#ggsave("box_plot_crab_sites.png", plot = box_plot_crab_sites, width = 10, height = 6, dpi = 300)

## Question 2-2
# Linear regression model
pie_lm_water <- lm(size ~ water_temp_sd, data = pie_crab)
summary(pie_lm_water)


Call:
lm(formula = size ~ water_temp_sd, data = pie_crab)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.9428 -2.6948 -0.2145  2.6573  8.8070 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   13.93728    1.15338  12.084   <2e-16 ***
water_temp_sd  0.09938    0.15716   0.632    0.528    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.506 on 390 degrees of freedom
Multiple R-squared:  0.001024,  Adjusted R-squared:  -0.001537 
F-statistic: 0.3999 on 1 and 390 DF,  p-value: 0.5275

# Linear Regression Line of Best fit
LOBF <- pie_crab %>% 
  ggscatter(x = 'water_temp_sd', y = 'size',
  alpha = .35,
  add = "reg.line")
print(LOBF)

#ggsave("LOBF.png", plot = LOBF, width = 10, height = 6, dpi = 300)

## Question 2-3
# Check for correlations
pie_crab %>% 
  dplyr::select(latitude, air_temp_sd, water_temp_sd) %>% 
  cor()

                latitude air_temp_sd water_temp_sd
latitude      1.00000000   0.7932130    0.04188273
air_temp_sd   0.79321301   1.0000000    0.40970338
water_temp_sd 0.04188273   0.4097034    1.00000000

# Multiple linear regression model
crab_mlm <- lm(size ~ latitude + air_temp_sd + water_temp_sd, data = pie_crab)
summary(crab_mlm)


Call:
lm(formula = size ~ latitude + air_temp_sd + water_temp_sd, data = pie_crab)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.7515 -1.8897  0.0506  1.9301  6.6746 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -3.96880    1.54818  -2.564   0.0107 *  
latitude       0.55940    0.06413   8.723   <2e-16 ***
air_temp_sd   -0.41713    0.30559  -1.365   0.1730    
water_temp_sd  0.15927    0.16174   0.985   0.3254    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.832 on 388 degrees of freedom
Multiple R-squared:  0.3516,    Adjusted R-squared:  0.3466 
F-statistic: 70.13 on 3 and 388 DF,  p-value: < 2.2e-16

Question 2-1:

The results of the ANOVA test are highly significant. At least one site has significantly different carapace widths than the other sites. We know this because the p-value is less than 0.05 (p < 2.2e-16), leading us to reject the null hypothesis. The Tukey’s HSD Test shows us that the site relationships: GTM-DB, PIE-DB, and PIE-GTM are statistically significant, because their p-values are all less than 0.05. This represents that each of these related sites are significantly different from each other.

Question 2-2:

The p-value is greater than 0.05, we fail to reject the null hypothesis. There is not enough statistical evidence to represent a significant relationship between water_temp_sd and crab carapace width.

Question 2-3:

air_temp_sd and latitude have a higher correlation coefficient, with it being greater than 0.7 (0.79321301), but water_temp_sd and air_temp_sd, and latitude and water_temp_sd, are not highly correlated. Highly correlated variables increase the complexity of the linear regression model. The overall p-value is less than 0.05 (p < 2.2e-16), suggesting a significant impact from the combination of predictors on crab carapace width. The individual p-values vary. Latitude is the only one less than 0.05, meaning it is the only statistically significant relationship with an effect on crab size. air_temp_sd and water_temp_sd both have p-values greater than 0.05, failing to reject the null hypothesis. There is not enough statistical evidence to represent a statistically significant relationship between these two predictors and crab carapace width.