Warning: package 'vip' was built under R version 4.4.3
Attaching package: 'vip'
The following object is masked from 'package:utils':
vi
library(baguette)
Warning: package 'baguette' was built under R version 4.4.3
# Data Import/Tidy/Transform root <-'https://gdex.ucar.edu/dataset/camels/file'download.file('https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf', 'data/camels_attributes_v2.0.pdf')
Warning in
download.file("https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf",
: URL https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf:
cannot open destfile 'data/camels_attributes_v2.0.pdf', reason 'No such file or
directory'
Warning in
download.file("https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf",
: download had nonzero exit status
types <-c("clim", "geol", "soil", "topo", "vege", "hydro")# Where the files live online ...remote_files <-glue('{root}/camels_{types}.txt')# where we want to download the data ...local_files <-glue('../data/lab_data/camels_hydro_data/camels_{types}.txt')walk2(remote_files, local_files, download.file, quiet =TRUE)# Read and merge datacamels <-map(local_files, read_delim, show_col_types =FALSE) camels <-power_full_join(camels ,by ='gauge_id') # Add log(q_mean) to dfcamels <- camels %>%mutate(logQmean =log(q_mean)) %>%mutate(across(everything(), as.double))
Warning: There were 5 warnings in `mutate()`.
The first warning was:
ℹ In argument: `across(everything(), as.double)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run `dplyr::last_dplyr_warnings()` to see the 4 remaining warnings.
skim(camels)
Data summary
Name
camels
Number of rows
671
Number of columns
59
_______________________
Column type frequency:
numeric
59
________________________
Group variables
None
Variable type: numeric
skim_variable
n_missing
complete_rate
mean
sd
p0
p25
p50
p75
p100
hist
gauge_id
0
1.00
6265830.84
3976867.52
1013500.00
2370650.00
6278300.00
9382765.00
14400000.00
▇▃▅▃▃
p_mean
0
1.00
3.26
1.41
0.64
2.37
3.23
3.78
8.94
▃▇▂▁▁
pet_mean
0
1.00
2.79
0.55
1.90
2.34
2.69
3.15
4.74
▇▇▅▂▁
p_seasonality
0
1.00
-0.04
0.53
-1.44
-0.26
0.08
0.22
0.92
▁▂▃▇▂
frac_snow
0
1.00
0.18
0.20
0.00
0.04
0.10
0.22
0.91
▇▂▁▁▁
aridity
0
1.00
1.06
0.62
0.22
0.70
0.86
1.27
5.21
▇▂▁▁▁
high_prec_freq
0
1.00
20.93
4.55
7.90
18.50
22.00
24.23
32.70
▂▃▇▇▁
high_prec_dur
0
1.00
1.35
0.19
1.08
1.21
1.28
1.44
2.09
▇▅▂▁▁
high_prec_timing
671
0.00
NaN
NA
NA
NA
NA
NA
NA
low_prec_freq
0
1.00
254.65
35.12
169.90
232.70
255.85
278.92
348.70
▂▅▇▅▁
low_prec_dur
0
1.00
5.95
3.20
2.79
4.24
4.95
6.70
36.51
▇▁▁▁▁
low_prec_timing
671
0.00
NaN
NA
NA
NA
NA
NA
NA
geol_1st_class
671
0.00
NaN
NA
NA
NA
NA
NA
NA
glim_1st_class_frac
0
1.00
0.79
0.20
0.30
0.61
0.83
1.00
1.00
▁▃▃▃▇
geol_2nd_class
671
0.00
NaN
NA
NA
NA
NA
NA
NA
glim_2nd_class_frac
0
1.00
0.16
0.14
0.00
0.00
0.14
0.27
0.49
▇▃▃▂▁
carbonate_rocks_frac
0
1.00
0.12
0.26
0.00
0.00
0.00
0.04
1.00
▇▁▁▁▁
geol_porostiy
3
1.00
0.13
0.07
0.01
0.07
0.13
0.19
0.28
▇▆▇▇▂
geol_permeability
0
1.00
-13.89
1.18
-16.50
-14.77
-13.96
-13.00
-10.90
▂▅▇▅▂
soil_depth_pelletier
0
1.00
10.87
16.24
0.27
1.00
1.23
12.89
50.00
▇▁▁▁▁
soil_depth_statsgo
0
1.00
1.29
0.27
0.40
1.11
1.46
1.50
1.50
▁▁▂▂▇
soil_porosity
0
1.00
0.44
0.02
0.37
0.43
0.44
0.46
0.68
▃▇▁▁▁
soil_conductivity
0
1.00
1.74
1.52
0.45
0.93
1.35
1.93
13.96
▇▁▁▁▁
max_water_content
0
1.00
0.53
0.15
0.09
0.43
0.56
0.64
1.05
▁▅▇▃▁
sand_frac
0
1.00
36.47
15.63
8.18
25.44
35.27
44.46
91.98
▅▇▅▁▁
silt_frac
0
1.00
33.86
13.25
2.99
23.95
34.06
43.64
67.77
▂▆▇▆▁
clay_frac
0
1.00
19.89
9.32
1.85
14.00
18.66
25.42
50.35
▃▇▅▂▁
water_frac
0
1.00
0.10
0.94
0.00
0.00
0.00
0.00
19.35
▇▁▁▁▁
organic_frac
0
1.00
0.59
3.84
0.00
0.00
0.00
0.00
57.86
▇▁▁▁▁
other_frac
0
1.00
9.82
16.83
0.00
0.00
1.31
11.74
99.38
▇▁▁▁▁
gauge_lat
0
1.00
39.24
5.21
27.05
35.70
39.25
43.21
48.82
▂▃▇▆▅
gauge_lon
0
1.00
-95.79
16.21
-124.39
-110.41
-92.78
-81.77
-67.94
▆▃▇▇▅
elev_mean
0
1.00
759.42
786.00
10.21
249.67
462.72
928.88
3571.18
▇▂▁▁▁
slope_mean
0
1.00
46.20
47.12
0.82
7.43
28.80
73.17
255.69
▇▂▂▁▁
area_gages2
0
1.00
792.62
1701.95
4.03
122.28
329.68
794.30
25791.04
▇▁▁▁▁
area_geospa_fabric
0
1.00
808.08
1709.85
4.10
127.98
340.70
804.50
25817.78
▇▁▁▁▁
frac_forest
0
1.00
0.64
0.37
0.00
0.28
0.81
0.97
1.00
▃▁▁▂▇
lai_max
0
1.00
3.22
1.52
0.37
1.81
3.37
4.70
5.58
▅▆▃▅▇
lai_diff
0
1.00
2.45
1.33
0.15
1.20
2.34
3.76
4.83
▇▇▇▆▇
gvf_max
0
1.00
0.72
0.17
0.18
0.61
0.78
0.86
0.92
▁▁▂▃▇
gvf_diff
0
1.00
0.32
0.15
0.03
0.19
0.32
0.46
0.65
▃▇▅▇▁
dom_land_cover_frac
0
1.00
0.81
0.18
0.31
0.65
0.86
1.00
1.00
▁▂▃▃▇
dom_land_cover
671
0.00
NaN
NA
NA
NA
NA
NA
NA
root_depth_50
24
0.96
0.18
0.03
0.12
0.17
0.18
0.19
0.25
▃▃▇▂▂
root_depth_99
24
0.96
1.83
0.30
1.50
1.52
1.80
2.00
3.10
▇▃▂▁▁
q_mean
1
1.00
1.49
1.54
0.00
0.63
1.13
1.75
9.69
▇▁▁▁▁
runoff_ratio
1
1.00
0.39
0.23
0.00
0.24
0.35
0.51
1.36
▆▇▂▁▁
slope_fdc
1
1.00
1.24
0.51
0.00
0.90
1.28
1.63
2.50
▂▅▇▇▁
baseflow_index
0
1.00
0.49
0.16
0.01
0.40
0.50
0.60
0.98
▁▃▇▅▁
stream_elas
1
1.00
1.83
0.78
-0.64
1.32
1.70
2.23
6.24
▁▇▃▁▁
q5
1
1.00
0.17
0.27
0.00
0.01
0.08
0.22
2.42
▇▁▁▁▁
q95
1
1.00
5.06
4.94
0.00
2.07
3.77
6.29
31.82
▇▂▁▁▁
high_q_freq
1
1.00
25.74
29.07
0.00
6.41
15.10
35.79
172.80
▇▂▁▁▁
high_q_dur
1
1.00
6.91
10.07
0.00
1.82
2.85
7.55
92.56
▇▁▁▁▁
low_q_freq
1
1.00
107.62
82.24
0.00
37.44
96.00
162.14
356.80
▇▆▅▂▁
low_q_dur
1
1.00
22.28
21.66
0.00
10.00
15.52
26.91
209.88
▇▁▁▁▁
zero_q_freq
1
1.00
0.03
0.11
0.00
0.00
0.00
0.00
0.97
▇▁▁▁▁
hfd_mean
1
1.00
182.52
33.53
112.25
160.16
173.77
204.05
287.75
▂▇▃▂▁
logQmean
1
1.00
-0.11
1.17
-5.39
-0.46
0.12
0.56
2.27
▁▁▂▇▂
vis_dat(camels)
# Set seedset.seed(567)# Generate the splitcamels_split <-initial_split(camels, prop =0.8)camels_tr <-training(camels_split)camels_te <-testing(camels_split)# Cross-validation foldscamels_10cv <-vfold_cv(camels_tr, v =10)# Reciperec <-recipe(logQmean ~ pet_mean + p_mean + aridity + runoff_ratio + baseflow_index + slope_mean + area_geospa_fabric, data = camels_tr) %>%step_YeoJohnson(all_predictors()) %>%step_interact(terms =~ pet_mean:p_mean + aridity:runoff_ratio + area_geospa_fabric:slope_mean) %>%step_corr(all_predictors(), threshold =0.9) %>%# Remove highly correlated predictors to avoid multicollinearity.step_normalize(all_predictors()) %>%step_naomit(all_predictors(), all_outcomes())# Define and Train Models## Define rf model rf_model <-rand_forest() %>%set_engine("ranger") %>%set_mode("regression") rf_wf <-workflow() %>%# Add the recipeadd_recipe(rec) %>%# Add the modeladd_model(rf_model) %>%# Fit the modelfit(data = camels_tr) rf_predictions <-augment(rf_wf, new_data = camels_te) ## Define xg model xg_model <-boost_tree() %>%set_engine("xgboost") %>%set_mode("regression") xg_wf <-workflow() %>%# Add the recipeadd_recipe(rec) %>%# Add the modeladd_model(xg_model) %>%# Fit the modelfit(data = camels_tr) xg_predictions <-augment(xg_wf, new_data = camels_te)## Define nueral net model nn_model <-bag_mlp() %>%set_engine("nnet") %>%set_mode("regression") nn_wf <-workflow() %>%# Add the recipeadd_recipe(rec) %>%# Add the modeladd_model(nn_model) %>%# Fit the modelfit(data = camels_tr) nn_predictions <-augment(nn_wf, new_data = camels_te)## Define linear reg model lm_model <-linear_reg() %>%set_engine("lm") %>%set_mode("regression") lm_wf <-workflow() %>%# Add the recipeadd_recipe(rec) %>%# Add the modeladd_model(lm_model) %>%# Fit the modelfit(data = camels_tr) lm_predictions <-augment(lm_wf, new_data = camels_te) # Implement workflowset analysis ml_wf_set <-workflow_set(preproc =list(rec),models =list(rf = rf_model, xg = xg_model, nn = nn_model, lm = lm_model)) %>%workflow_map('fit_resamples', resamples = camels_10cv)
Warning: package 'ranger' was built under R version 4.4.3
# A tibble: 60 × 8
hidden_units penalty .metric .estimator mean n std_err .config
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 0.000000144 mae standard 0.104 10 0.00738 Preproces…
2 1 0.000000144 rmse standard 0.173 10 0.0246 Preproces…
3 1 0.000000144 rsq standard 0.980 10 0.00397 Preproces…
4 1 0.0000616 mae standard 0.109 10 0.00715 Preproces…
5 1 0.0000616 rmse standard 0.174 10 0.0242 Preproces…
6 1 0.0000616 rsq standard 0.979 10 0.00393 Preproces…
7 2 0.0264 mae standard 0.0399 10 0.00453 Preproces…
8 2 0.0264 rmse standard 0.0824 10 0.0205 Preproces…
9 2 0.0264 rsq standard 0.995 10 0.00191 Preproces…
10 2 0.00000000113 mae standard 0.0339 10 0.00442 Preproces…
# ℹ 50 more rows
best_mae <-show_best(model_params, metric ="mae", n =1)hp_best <-select_best(model_params, metric ="mae")#> The first row shows the mean MAE across resamples, SE of the MAE estimate, # of resamples, and mean SE. Penalty is the best hyperparameter set for this model. final_wf <-finalize_workflow(wf_tune, hp_best)final_fit <-last_fit(final_wf, split = camels_split)final_metrics <-collect_metrics(final_fit)# The final model's rmse 0.010 and the rsq is 0.999. This means that 99.9% of the variance is explained by the model. This is an excellent number and result. The rmse is the average prediction error, and this percentage is ~1% which is quite good. This model is very good, but may be less efficient than more simple models which are less demanding computationally with worse but still acceptable values for rsq and rmse.predictions <-collect_predictions(model_params)ggplot(predictions, aes(x = .pred, y = logQmean)) +geom_smooth(method = lm, color ="blue") +geom_abline(slope =1, intercept =0, linetype ="dashed") +scale_color_gradient() +labs(title ="Actual vs. Predicted Values", x ="Predicted", y ="Actual")
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).