Daily Exercise 16: Building a ML Workflow

ESS 330 - Quantitative Reasoning

Author

# Zachary Cramton
# 25 March 2025
# ESS 330 - Daily Assignment 15/16
# This daily assignment uses the penguins dataset to practice seeding, splitting, and cross-validation (Day 15), then fitting and comparing models with tidymodels workflows (Day 16).

# Load necessary libraries and data
library("tidymodels")

── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

✔ broom        1.0.7     ✔ recipes      1.1.0
✔ dials        1.3.0     ✔ rsample      1.2.1
✔ dplyr        1.1.4     ✔ tibble       3.2.1
✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
✔ infer        1.0.7     ✔ tune         1.2.1
✔ modeldata    1.4.0     ✔ workflows    1.1.4
✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
✔ purrr        1.0.2     ✔ yardstick    1.3.2

Warning: package 'scales' was built under R version 4.4.3

── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Use suppressPackageStartupMessages() to eliminate package startup messages

library("rsample")
library("palmerpenguins")


Attaching package: 'palmerpenguins'

The following object is masked from 'package:modeldata':

    penguins

library("ranger")

Warning: package 'ranger' was built under R version 4.4.3

data("penguins")

# ---- Day 15: Data Cleaning and Splitting ----

# Drop every row containing a missing value, then inspect the structure of what is left
penguins <- penguins %>% drop_na()
str(penguins)

tibble [333 × 8] (S3: tbl_df/tbl/data.frame)
 $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ bill_length_mm   : num [1:333] 39.1 39.5 40.3 36.7 39.3 38.9 39.2 41.1 38.6 34.6 ...
 $ bill_depth_mm    : num [1:333] 18.7 17.4 18 19.3 20.6 17.8 19.6 17.6 21.2 21.1 ...
 $ flipper_length_mm: int [1:333] 181 186 195 193 190 181 195 182 191 198 ...
 $ body_mass_g      : int [1:333] 3750 3800 3250 3450 3650 3625 4675 3200 3800 4400 ...
 $ sex              : Factor w/ 2 levels "female","male": 2 1 1 1 2 1 2 1 2 2 ...
 $ year             : int [1:333] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...

# Column-wise preview of the cleaned data
penguins %>% glimpse()

Rows: 333
Columns: 8
$ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
$ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
$ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 18…
$ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
$ sex               <fct> male, female, female, female, male, female, male, fe…
$ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

# Fix the RNG state before splitting (seed value borrowed from lecture)
set.seed(101991)

# Build the 70/30 train/test split object, stratified by species
penguins_strata <- penguins %>%
  initial_split(prop = 0.7, strata = species)

# Hand check of the expected test share: 100 of the 333 cleaned rows is ~30%
100 / 333

[1] 0.3003003

# Pull the training and testing partitions out of the split object
penguins_train <- penguins_strata %>% training()
penguins_test <- penguins_strata %>% testing()

# Display the training tibble (the testing tibble is printed next)
penguins_train

# A tibble: 232 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           39.3          20.6               190        3650
 5 Adelie  Torgersen           39.2          19.6               195        4675
 6 Adelie  Torgersen           41.1          17.6               182        3200
 7 Adelie  Torgersen           38.6          21.2               191        3800
 8 Adelie  Torgersen           36.6          17.8               185        3700
 9 Adelie  Torgersen           38.7          19                 195        3450
10 Adelie  Torgersen           42.5          20.7               197        4500
# ℹ 222 more rows
# ℹ 2 more variables: sex <fct>, year <int>

# Display the testing tibble produced by testing(penguins_strata)
penguins_test

# A tibble: 101 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           36.7          19.3               193        3450
 2 Adelie  Torgersen           38.9          17.8               181        3625
 3 Adelie  Torgersen           34.6          21.1               198        4400
 4 Adelie  Biscoe              37.8          18.3               174        3400
 5 Adelie  Biscoe              37.7          18.7               180        3600
 6 Adelie  Biscoe              38.8          17.2               180        3800
 7 Adelie  Dream               37.2          18.1               178        3900
 8 Adelie  Dream               39.2          21.1               196        4150
 9 Adelie  Dream               40.8          18.4               195        3900
10 Adelie  Biscoe              40.1          18.9               188        4300
# ℹ 91 more rows
# ℹ 2 more variables: sex <fct>, year <int>

# Resampling: fix the RNG state before building folds (seed value from the lecture)
set.seed(3214)

# 10-fold cross-validation: expected number of training rows held out per fold
nrow(penguins_train) / 10

[1] 23.2

# Preview a 10-fold partition of the training data (printed only, not assigned)
penguins_train %>% vfold_cv(v = 10)

#  10-fold cross-validation 
# A tibble: 10 × 2
   splits           id    
   <list>           <chr> 
 1 <split [208/24]> Fold01
 2 <split [208/24]> Fold02
 3 <split [209/23]> Fold03
 4 <split [209/23]> Fold04
 5 <split [209/23]> Fold05
 6 <split [209/23]> Fold06
 7 <split [209/23]> Fold07
 8 <split [209/23]> Fold08
 9 <split [209/23]> Fold09
10 <split [209/23]> Fold10

# Store the folds that the model fitting step consumes.
# NOTE(review): this is a second, separate vfold_cv() call made without
# re-seeding, so these folds are a new random partition rather than the one
# printed by the earlier unassigned call. No `v` is passed here, so the fold
# count relies on vfold_cv()'s default -- confirm it is the intended 10.
penguin_folds <- penguins_train %>% vfold_cv()

# Show the first three split objects
penguin_folds[["splits"]][1:3]

[[1]]
<Analysis/Assess/Total>
<208/24/232>

[[2]]
<Analysis/Assess/Total>
<208/24/232>

[[3]]
<Analysis/Assess/Total>
<209/23/232>

# ---- Day 16: Model Fitting and Workflow ----

# Define the multinomial (multiclass logistic) regression model.
# multinom_reg() is used because the outcome, species, has three levels.
log_reg_model <- multinom_reg() %>%
  set_engine("nnet") %>%
  set_mode("classification")

# Define the random forest model
rand_forest_model <- rand_forest() %>%
  set_engine("ranger") %>%   # Default engine for rand_forest()
  set_mode("classification")

# Define the pre-processing recipe: predict species from every other column,
# normalizing numeric predictors to mean = 0 and variance = 1
penguin_recipe <- recipe(species ~ ., data = penguins_train) %>%
  step_normalize(all_numeric_predictors())

# Create standalone workflows that keep each model and the recipe together.
# These are not used by the comparison below, which builds its own workflows
# inside workflow_set().
log_reg_wf <- workflow() %>%
  add_model(log_reg_model) %>%       # Adds log_reg_model to the workflow
  add_recipe(penguin_recipe)         # Adds the normalization recipe

rand_forest_wf <- workflow() %>%
  add_model(rand_forest_model) %>%   # Adds rand_forest_model to the workflow
  add_recipe(penguin_recipe)         # Adds the normalization recipe

# Compare the models with a workflow set: one shared recipe crossed with both
# model specifications. The list names become part of each wflow_id.
model_set <- workflow_set(
  preproc = list(recipe = penguin_recipe),
  models = list(
    logistic_regression = log_reg_model,
    random_forest = rand_forest_model
  )
)

# Fit every workflow on the cross-validation folds, scoring accuracy only
set.seed(34215)   # Randomly selected seed
results <- model_set %>%
  workflow_map(
    "fit_resamples",
    resamples = penguin_folds,
    metrics = metric_set(accuracy)
  )

# Rank the models by mean cross-validated accuracy and print the ranking
ranked_results <- rank_results(results, rank_metric = "accuracy")
print(ranked_results)

# A tibble: 2 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_logistic_… Prepro… accura… 0.987 0.00922    10 recipe       mult…     1
2 recipe_random_fo… Prepro… accura… 0.983 0.00710    10 recipe       rand…     2

# Comment:
# With a standard logistic_reg() model, the mean accuracy was only ~0.638 with poor stability,
  # and the rand_forest() model was far superior in that case.
# Because logistic_reg() is set up for binary outcomes rather than multiclass classification,
  # I switched log_reg_model to a different model type.
# The multinom_reg() model supports multiclass classification, with the nnet engine
  # handling more than two classes.

# After switching to the multinom_reg() model, it outperformed the random forest
  # model by ~0.004 in mean accuracy (0.987 vs. 0.983), making it the better model for accuracy.
# However, the random forest model had the smaller accuracy std. error by ~0.002
  # (0.0071 vs. 0.0092), making it the better choice for stability.
# Note that the ~0.004 accuracy gap is smaller than either model's std. error,
  # so the two models perform essentially the same on these folds.