Lab 6: Machine Learning in Hydrology

ESS 330 - Quantitative Reasoning

Setup

# Load necessary libraries
library(tidyverse)
Warning: package 'lubridate' was built under R version 4.4.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.3.0     ✔ tune         1.2.1
✔ infer        1.0.7     ✔ workflows    1.1.4
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.2.1     ✔ yardstick    1.3.2
✔ recipes      1.1.0     
Warning: package 'scales' was built under R version 4.4.3
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Dig deeper into tidy modeling with R at https://www.tmwr.org
library(powerjoin)
Warning: package 'powerjoin' was built under R version 4.4.3
library(glue)
library(vip)
Warning: package 'vip' was built under R version 4.4.3

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
library(baguette)
Warning: package 'baguette' was built under R version 4.4.3
library(patchwork)
library(ggthemes)
Warning: package 'ggthemes' was built under R version 4.4.3
library(ggpubr)
Warning: package 'ggpubr' was built under R version 4.4.3
library(xgboost)
Warning: package 'xgboost' was built under R version 4.4.3

Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice
library(kernlab)

Attaching package: 'kernlab'

The following object is masked from 'package:scales':

    alpha

The following object is masked from 'package:purrr':

    cross

The following object is masked from 'package:ggplot2':

    alpha

# Download data
root  <- 'https://gdex.ucar.edu/dataset/camels/file'

# Download metadata and documentation
dir.create('data', showWarnings = FALSE)  # Create the destination folder if missing
download.file('https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf', 
              'data/camels_attributes_v2.0.pdf', mode = "wb")
# Get data specific text files
types <- c("clim", "geol", "soil", "topo", "vege", "hydro")

# Construct URLs and file names for the data
remote_files <- glue('{root}/camels_{types}.txt')
local_files <- glue('../data/lab_data/camels_hydro_data/camels_{types}.txt')

# Download specific data (create the local folder first so download.file succeeds)
dir.create('../data/lab_data/camels_hydro_data', recursive = TRUE, showWarnings = FALSE)
walk2(remote_files, local_files, download.file, quiet = TRUE)

# Read and merge data
camels <- map(local_files, read_delim, show_col_types = FALSE) 
camels <- power_full_join(camels, by = 'gauge_id')
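
As a quick sanity check (a minimal sketch, assuming the join above succeeded), the merged table should hold one row per gauge with all six attribute groups joined on:

# One row per gauge after the full join
dim(camels)
n_distinct(camels$gauge_id)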

Question 1:

zero_q_freq represents the frequency of days with Q = 0 mm/day, i.e., days on which the gauge records no discharge/flow.
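
To ground that definition, here is a minimal sketch (assuming the merged camels table from Setup) of how zero-flow frequency varies across gauges; see camels_attributes_v2.0.pdf for the exact units:

# Distribution of zero-flow frequency across gauges
camels |> 
  summarize(min = min(zero_q_freq, na.rm = TRUE),
            median = median(zero_q_freq, na.rm = TRUE),
            max = max(zero_q_freq, na.rm = TRUE))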

Question 2:

# Gauge Locations Plot
p_mean_plot <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = p_mean)) + 
  labs(title = "Mean Daily Precipitation Across the Continental US", color = "Precipitation (mm)") +
  scale_color_gradient(low = "tan", high = "darkgreen") + 
  theme_map() +
  coord_fixed(1.3) + # Ensures correct aspect ratio
    theme(legend.position = "bottom",
          plot.title = element_text(hjust = 0.5))  # Center the title horizontally
  

# Aridity Plot
aridity_plot <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat))+
  borders("state", colour = "gray50") + geom_point(aes(color = aridity)) + labs(title = "Aridity Across the Contintnetal US", color = "Aridity Index") + scale_color_gradient(low = "dodgerblue", high = "red") + 
  theme_map() +
  coord_fixed(1.3) +  # Ensures correct aspect ratio
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5)) # Center the title horizontally
  


# Combine Plots
combined_maps <- ggarrange(
  p_mean_plot, aridity_plot,
  ncol = 1, nrow = 2,  # Stack vertically
  common.legend = FALSE,  # Separate legends
  legend = "bottom",  # Place the legend at the bottom
  heights = c(1, 1)  # Ensure both plots have equal height
)

# Display combined plots
combined_maps

Question 3:

Lab Activity:

# Model Preparation
camels |> 
  select(aridity, p_mean, q_mean) |> 
  drop_na() |> 
  cor()
           aridity     p_mean     q_mean
aridity  1.0000000 -0.7550090 -0.5817771
p_mean  -0.7550090  1.0000000  0.8865757
q_mean  -0.5817771  0.8865757  1.0000000
# Create a scatter plot of aridity vs rainfall
ggplot(camels, aes(x = aridity, y = p_mean)) +
  # Add points colored by mean flow
  geom_point(aes(color = q_mean)) +
  # Add a linear regression line
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  # Apply the viridis color scale
  scale_color_viridis_c() +
  # Add a title, axis labels, and theme (w/ legend on the bottom)
  theme_linedraw() + 
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

# Create a scatter plot of aridity vs rainfall with log axes
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  # Apply log transformations to the x and y axes
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

# Scale the legend to the log scale plot
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  # Apply a log transformation to the color scale
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        # Expand the legend width ...
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow") 
`geom_smooth()` using formula = 'y ~ x'

# Building the Model
set.seed(123)
# Bad form to perform simple transformations on the outcome variable within a 
# recipe. So, we'll do it here.
camels <- camels |> 
  mutate(logQmean = log(q_mean))

# Generate the split
camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

camels_cv <- vfold_cv(camels_train, v = 10)

# Create a recipe to preprocess the data
rec <-  recipe(logQmean ~ aridity + p_mean, data = camels_train) %>%
  # Log transform the predictor variables (aridity and p_mean)
  step_log(all_predictors()) %>%
  # Add an interaction term between aridity and p_mean
  step_interact(terms = ~ aridity:p_mean) |> 
  # Drop any rows with missing values in the pred
  step_naomit(all_predictors(), all_outcomes())

# Prepare the data
baked_data <- prep(rec, camels_train) |> 
  bake(new_data = NULL)
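
One detail worth confirming (a small sketch): step_interact names its engineered column aridity_x_p_mean, which the sanity check further below relies on.

# Confirm the engineered column names, including the interaction term
names(baked_data)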

# Interaction with lm
#  Base lm sets interaction terms with the * symbol
lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)

Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.77586    0.16365 -10.852  < 2e-16 ***
aridity        -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean          1.48438    0.15511   9.570  < 2e-16 ***
aridity:p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
# Sanity Interaction term from recipe ... these should be equal!!
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))

Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean, 
    data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -1.77586    0.16365 -10.852  < 2e-16 ***
aridity          -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean            1.48438    0.15511   9.570  < 2e-16 ***
aridity_x_p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
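
Since the two summaries above should agree, a one-line programmatic check (a sketch, assuming both fits are still in memory) makes the comparison explicit:

# The interaction coefficients should match to numerical precision
all.equal(unname(coef(lm_base)["aridity:p_mean"]),
          unname(coef(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean,
                         data = baked_data))["aridity_x_p_mean"]))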

Question 3 Deliverable: Adjusted workflow set

# Data Validation
# prep %>% bake %>% predict
test_data <- bake(prep(rec), new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)

# Model Evaluation
## Statistical
metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
## Visual
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  # Apply a gradient color scale
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_linedraw() + 
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

# Alternative Method: Workflow
# Define model
lm_model <- linear_reg() %>%
  # define the engine
  set_engine("lm") %>%
  # define the mode
  set_mode("regression")

# Instantiate a workflow ...
lm_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(lm_model) %>%
  # Fit the model to the training data
  fit(data = camels_train) 
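
As a cross-check (a sketch, assuming the fit above succeeded), the workflow's test-set predictions should closely mirror the base-lm evaluation from the validation step:

# Workflow predictions on the test set; metrics should be near the lm_base values
lm_wf_data <- augment(lm_wf, new_data = camels_test)
metrics(lm_wf_data, truth = logQmean, estimate = .pred)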

# Adding other models to the workflow
rf_model <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(rf_model) %>%
  # Fit the model
  fit(data = camels_train)

rf_data <- augment(rf_wf, new_data = camels_test)
dim(rf_data)
[1] 135  60
# Adding xgboost
xg_model <- boost_tree() %>% 
  set_engine("xgboost") %>% 
  set_mode("regression") 

xg_wf <- workflow() %>% 
  add_recipe(rec) %>%       # Adding recipe
  add_model(xg_model) %>%   # Adding model
  fit(data = camels_train)  # Fitting model

xg_data <- augment(xg_wf, new_data = camels_test)
dim(xg_data)
[1] 135  60
# Adding an nnet
nn_model <- bag_mlp() %>% 
  set_engine("nnet") %>% 
  set_mode("regression") 

nn_wf <- workflow() %>% 
  add_recipe(rec) %>%       # Adding recipe
  add_model(nn_model) %>%   # Adding model
  fit(data = camels_train)  # Fitting model

nn_data <- augment(nn_wf, new_data = camels_test)
dim(nn_data)
[1] 135  60
metrics(rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.588
2 rsq     standard       0.740
3 mae     standard       0.365
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()
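
For completeness, the same yardstick call applies to the other two test-set prediction frames (a sketch; run it to compare all three models on identical footing):

# Identical evaluation for the boosted tree and bagged MLP predictions
metrics(xg_data, truth = logQmean, estimate = .pred)
metrics(nn_data, truth = logQmean, estimate = .pred)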

# Workflowset approach
wf <- workflow_set(list(rec), list(lm_model, rf_model, xg_model, nn_model)) %>%
  workflow_map('fit_resamples', resamples = camels_cv) 
Warning: package 'ranger' was built under R version 4.4.3
autoplot(wf)

rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 8 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_bag_mlp    Prepro… rmse    0.552  0.0292    10 recipe       bag_…     1
2 recipe_bag_mlp    Prepro… rsq     0.782  0.0270    10 recipe       bag_…     1
3 recipe_rand_fore… Prepro… rmse    0.564  0.0255    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.770  0.0259    10 recipe       rand…     2
5 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     3
6 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     3
7 recipe_boost_tree Prepro… rmse    0.600  0.0289    10 recipe       boos…     4
8 recipe_boost_tree Prepro… rsq     0.745  0.0268    10 recipe       boos…     4
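To pull the winner programmatically rather than by eye, a small sketch (dplyr::slice is namespaced here because xgboost masks slice):

# Best-ranked workflow id by R-squared
rank_results(wf, rank_metric = "rsq", select_best = TRUE) |> 
  filter(.metric == "rsq") |> 
  dplyr::slice(1) |> 
  pull(wflow_id)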

The bagged MLP neural network appears to be the best model, with the highest mean R-squared (~0.78) and also the lowest RMSE (~0.55). The random forest is a close second on both metrics (R-squared ~0.77, RMSE ~0.56), so it would be a reasonable fallback if the bagged MLP proved unstable.

Question 4 Deliverable

# Set seed
set.seed(6515)
camels <- camels %>% 
  mutate(logQmean = log(q_mean))   # Add logQmean column to df

# Data Splitting
## Generate split (75/25)
camels_split <- initial_split(camels, prop = 0.75)
  ## Extract training and testing sets
  camels_tr <- training(camels_split)
  camels_te  <- testing(camels_split)
  ## 10-fold CV dataset
  camels_10cv <- vfold_cv(camels_tr, v = 10)
  
# Recipe Vars Review
  # Check for skewing
camels_tr %>%
  select(pet_mean, p_mean, runoff_ratio, baseflow_index, aridity, slope_mean, area_geospa_fabric) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal()
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_bin()`).
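
As a numeric companion to the histograms (a sketch), a large gap between mean and median flags the right-skewed predictors that motivate the Yeo-Johnson step below:

# Mean vs median for a few candidate predictors; big gaps suggest skew
camels_tr %>% 
  summarize(across(c(p_mean, aridity, slope_mean, area_geospa_fabric),
                   list(mean = \(x) mean(x, na.rm = TRUE),
                        median = \(x) median(x, na.rm = TRUE))))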

# Recipe
alt_rec <- recipe(logQmean ~ pet_mean + p_mean + aridity + runoff_ratio + baseflow_index + slope_mean + area_geospa_fabric, data = camels_tr) %>% 
  step_YeoJohnson(all_predictors()) %>% 
  step_interact(terms = ~ pet_mean:p_mean + aridity:runoff_ratio + area_geospa_fabric:slope_mean) %>% 
  step_corr(all_predictors(), threshold = 0.9) %>%   # Remove highly correlated predictors to avoid multicollinearity.
  step_normalize(all_predictors()) %>% 
  step_naomit(all_predictors(), all_outcomes())
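
Before fitting, a quick prep-and-bake sketch confirms the recipe runs end to end and shows the engineered columns (assuming camels_tr from the split above):

# Dry-run the recipe: prep on the training data, bake, and inspect the result
alt_baked <- prep(alt_rec, training = camels_tr) %>% 
  bake(new_data = NULL)
glimpse(alt_baked)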
  
# Define and Train Models
  ## Define rf model
  rf_alt_model <- rand_forest() %>% 
    set_engine("ranger") %>% 
    set_mode("regression")
  
  rf_alt_wf <- workflow() %>%
    # Add the recipe
    add_recipe(alt_rec) %>%
    # Add the model
    add_model(rf_alt_model) %>%
    # Fit the model
    fit(data = camels_tr)
   
  rf_predictions <- augment(rf_alt_wf, new_data = camels_te) 

  ## Define xg model
  xg_alt_model <- boost_tree() %>% 
    set_engine("xgboost") %>% 
    set_mode("regression")
  
  xg_alt_wf <- workflow() %>%
    # Add the recipe
    add_recipe(alt_rec) %>%
    # Add the model
    add_model(xg_alt_model) %>%
    # Fit the model
    fit(data = camels_tr)
  
  xg_predictions <- augment(xg_alt_wf, new_data = camels_te)
  
  ## Define neural net model
  nn_alt_model <- bag_mlp() %>% 
    set_engine("nnet") %>% 
    set_mode("regression")
  
  nn_alt_wf <- workflow() %>%
    # Add the recipe
    add_recipe(alt_rec) %>%
    # Add the model
    add_model(nn_alt_model) %>%
    # Fit the model
    fit(data = camels_tr)
  
  nn_predictions <- augment(nn_alt_wf, new_data = camels_te)
  
  ## Define linear reg model
  lm_alt_model <- linear_reg() %>% 
    set_engine("lm") %>% 
    set_mode("regression")
  
  lm_alt_wf <- workflow() %>%
    # Add the recipe
    add_recipe(alt_rec) %>%
    # Add the model
    add_model(lm_alt_model) %>%
    # Fit the model
    fit(data = camels_tr)
  
  lm_predictions <- augment(lm_alt_wf, new_data = camels_te) 
  
  ## Define SVM-nonlinear model
  svm_alt_model <- svm_rbf() %>% 
    set_engine("kernlab") %>% 
    set_mode("regression")

  svm_alt_wf <- workflow() %>%
    # Add the recipe
    add_recipe(alt_rec) %>%
    # Add the model
    add_model(svm_alt_model) %>%
    # Fit the model
    fit(data = camels_tr)  
  
  svm_predictions <- augment(svm_alt_wf, new_data = camels_te)
  
 # Implement workflowset analysis
  
  alt_wf_set <- workflow_set(preproc = list(alt_rec),
                          models = list(rf = rf_alt_model, 
                                        xg = xg_alt_model, 
                                        nn = nn_alt_model, 
                                        lm = lm_alt_model, 
                                        svm = svm_alt_model)) %>%
  workflow_map('fit_resamples', resamples = camels_10cv) 
  
autoplot(alt_wf_set)

rank_results(alt_wf_set, rank_metric = "rsq", select_best = TRUE)
# A tibble: 10 × 9
   wflow_id   .config       .metric  mean std_err     n preprocessor model  rank
   <chr>      <chr>         <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
 1 recipe_nn  Preprocessor… rmse    0.565  0.0347    10 recipe       bag_…     1
 2 recipe_nn  Preprocessor… rsq     0.771  0.0149    10 recipe       bag_…     1
 3 recipe_lm  Preprocessor… rmse    0.571  0.0426    10 recipe       line…     2
 4 recipe_lm  Preprocessor… rsq     0.765  0.0222    10 recipe       line…     2
 5 recipe_svm Preprocessor… rmse    0.561  0.0414    10 recipe       svm_…     3
 6 recipe_svm Preprocessor… rsq     0.760  0.0204    10 recipe       svm_…     3
 7 recipe_rf  Preprocessor… rmse    0.576  0.0397    10 recipe       rand…     4
 8 recipe_rf  Preprocessor… rsq     0.754  0.0188    10 recipe       rand…     4
 9 recipe_xg  Preprocessor… rmse    0.610  0.0371    10 recipe       boos…     5
10 recipe_xg  Preprocessor… rsq     0.724  0.0161    10 recipe       boos…     5
# Moving forward with the NN Model

  ## Bagged neural networks do not expose simple coefficients the way lm does
  ## (coef() on the model spec returns NULL), so inspect the fitted parsnip
  ## object from the workflow instead
  nn_fit <- extract_fit_parsnip(nn_alt_wf)
  nn_fit
  ## Use the data to make predictions
  metrics(nn_predictions, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard      0.0615
2 rsq     standard      0.998 
3 mae     standard      0.0111
  ggplot(nn_predictions, aes(x = logQmean, y = .pred)) +
    geom_point(aes(color = .pred), size = 2) +
    scale_color_gradient(low = "tan", high = "royalblue") +
    labs(title = "Observed vs Predicted Values with the NN Model",
         x = "Observed Log Mean Flow",
         y = "Predicted Log Mean Flow",
         color = "Aridity") +
    geom_abline(linetype = 2) +
    theme_linedraw()

Q4b: I chose a more complex formula to capture multiple elements of the watershed. Potential evapotranspiration (pet_mean), precipitation (p_mean), and aridity are all related, as are runoff ratio, slope, and catchment area (area_geospa_fabric). Finally, I included the baseflow index because predicting flow seems difficult without first accounting for the water already supplied by groundwater and other sources.

Q4c: I selected the above models in an attempt to find non-linear representations of the data. Given the complexity of the formula in my recipe, I anticipated that linear models would fit the data poorly.

Q4e: For the recipe I created, the bag_mlp neural network model performed the best. It had the highest mean R-squared (~0.771) and the second-lowest root mean squared error (RMSE, ~0.565). The SVM model had a marginally lower RMSE (~0.561) but also a lower mean R-squared (~0.760). To push further, I could tune the SVM and NN models to optimize them and then compare again; a sketch of what that tuning could look like follows below.
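
A minimal tuning sketch for the SVM, assuming the objects defined above (alt_rec, camels_10cv); grid = 10 is an arbitrary starting point, not a recommendation:

# Mark cost and rbf_sigma as tunable, then grid-search over the CV folds
svm_tune_model <- svm_rbf(cost = tune(), rbf_sigma = tune()) %>% 
  set_engine("kernlab") %>% 
  set_mode("regression")

svm_tune_wf <- workflow() %>% 
  add_recipe(alt_rec) %>% 
  add_model(svm_tune_model)

svm_tuned <- tune_grid(svm_tune_wf, resamples = camels_10cv, grid = 10)
show_best(svm_tuned, metric = "rmse")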

Q4f: I am very happy with the results. Outside of a few outliers, my models seem to improve on the performance of the recipe and models from Part 3.