# 25 March 2025
# This daily assignment uses the penguins dataset to practice seeding, splitting, and cross-validation.
# Load necessary libraries and data
library ("tidymodels" )
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom 1.0.7 ✔ recipes 1.1.0
✔ dials 1.3.0 ✔ rsample 1.2.1
✔ dplyr 1.1.4 ✔ tibble 3.2.1
✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
✔ infer 1.0.7 ✔ tune 1.2.1
✔ modeldata 1.4.0 ✔ workflows 1.1.4
✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
✔ purrr 1.0.2 ✔ yardstick 1.3.2
Warning: package 'scales' was built under R version 4.4.3
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ recipes::step() masks stats::step()
• Use suppressPackageStartupMessages() to eliminate package startup messages
library ("rsample" )
library ("palmerpenguins" )
Attaching package: 'palmerpenguins'
The following object is masked from 'package:modeldata':
penguins
# Load the penguins data into the workspace
data("penguins")
# Drop rows containing missing values, then inspect the cleaned data frame
penguins <- penguins |>
  drop_na()
str(penguins)
tibble [333 × 8] (S3: tbl_df/tbl/data.frame)
$ species : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
$ island : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
$ bill_length_mm : num [1:333] 39.1 39.5 40.3 36.7 39.3 38.9 39.2 41.1 38.6 34.6 ...
$ bill_depth_mm : num [1:333] 18.7 17.4 18 19.3 20.6 17.8 19.6 17.6 21.2 21.1 ...
$ flipper_length_mm: int [1:333] 181 186 195 193 190 181 195 182 191 198 ...
$ body_mass_g : int [1:333] 3750 3800 3250 3450 3650 3625 4675 3200 3800 4400 ...
$ sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 1 2 1 2 2 ...
$ year : int [1:333] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
Rows: 333
Columns: 8
$ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
$ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
$ bill_length_mm <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
$ bill_depth_mm <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
$ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 18…
$ body_mass_g <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
$ sex <fct> male, female, female, female, male, female, male, fe…
$ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
# Set the seed so the random split below is reproducible
set.seed(101991) # Random seed borrowed from lecture
# Create a 70/30 train/test split, stratified by species
penguins_strata <- initial_split(penguins, prop = 0.7, strata = species)
# Extract split data
penguins_train <- training(penguins_strata)
penguins_test <- testing(penguins_strata)
# Check test split: proportion of rows that actually landed in the test set,
# computed from the data instead of a hard-coded row count
nrow(penguins_test) / nrow(penguins)
# Print tibbles for train/test data
penguins_train
penguins_test
# A tibble: 232 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen 39.3 20.6 190 3650
5 Adelie Torgersen 39.2 19.6 195 4675
6 Adelie Torgersen 41.1 17.6 182 3200
7 Adelie Torgersen 38.6 21.2 191 3800
8 Adelie Torgersen 36.6 17.8 185 3700
9 Adelie Torgersen 38.7 19 195 3450
10 Adelie Torgersen 42.5 20.7 197 4500
# ℹ 222 more rows
# ℹ 2 more variables: sex <fct>, year <int>
# A tibble: 101 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 36.7 19.3 193 3450
2 Adelie Torgersen 38.9 17.8 181 3625
3 Adelie Torgersen 34.6 21.1 198 4400
4 Adelie Biscoe 37.8 18.3 174 3400
5 Adelie Biscoe 37.7 18.7 180 3600
6 Adelie Biscoe 38.8 17.2 180 3800
7 Adelie Dream 37.2 18.1 178 3900
8 Adelie Dream 39.2 21.1 196 4150
9 Adelie Dream 40.8 18.4 195 3900
10 Adelie Biscoe 40.1 18.9 188 4300
# ℹ 91 more rows
# ℹ 2 more variables: sex <fct>, year <int>
# Resampling
# Seed the RNG before creating folds (random number taken from the lecture)
set.seed(3214)
# Expected size of each assessment set when the training data is cut into 10 folds
nrow(penguins_train) / 10
# Run 10-fold cross-validation on the training data and print the resamples
penguins_train |>
  vfold_cv(v = 10)
# 10-fold cross-validation
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [208/24]> Fold01
2 <split [208/24]> Fold02
3 <split [209/23]> Fold03
4 <split [209/23]> Fold04
5 <split [209/23]> Fold05
6 <split [209/23]> Fold06
7 <split [209/23]> Fold07
8 <split [209/23]> Fold08
9 <split [209/23]> Fold09
10 <split [209/23]> Fold10
# Store the 10 folds for later use. Re-seeding with the resampling seed first
# makes this call draw the same partition as the folds printed above, rather
# than a second, different random partition.
set.seed(3214)
penguin_folds <- vfold_cv(penguins_train, v = 10)
# Inspect the first three resamples
penguin_folds$splits[1:3]
[[1]]
<Analysis/Assess/Total>
<208/24/232>
[[2]]
<Analysis/Assess/Total>
<208/24/232>
[[3]]
<Analysis/Assess/Total>
<209/23/232>