# Author: Zachary Cramton
# Date:   25 March 2025
# Course: ESS 330 - Daily Assignment 15/16
#
# This daily assignment uses the penguins dataset to practice seeding,
# splitting and cross validation.

# Load the required packages.
# NOTE(review): `penguins` is not loaded explicitly in this script; presumably
# it is supplied by a package attached through tidymodels -- confirm.
library(tidymodels)
# ---- Day 15: Seeding, Splitting and Cross Validation ----

# Keep complete cases only. The split check originally written here used a
# denominator of 333 rows, which suggests rows with missing values had been
# removed before splitting -- TODO confirm against the intended data prep.
penguins <- drop_na(penguins)

# Fix the RNG state so the split and the folds below are reproducible.
set.seed(101991) # Random seed borrowed from lecture

# 70/30 train/test split, stratified by species.
penguins_strata <- initial_split(penguins, strata = species, prop = 0.7)

# Extract the two partitions from the split object. The modelling section
# builds its recipe on `penguins_train`.
penguins_train <- training(penguins_strata)
penguins_test <- testing(penguins_strata)

# Check the test share from the actual row counts rather than a hard-coded
# 100/333; the value should be close to 0.3.
nrow(penguins_test) / nrow(penguins)

# Cross-validation folds built from the training data only. `penguin_folds`
# is the resampling object passed to workflow_map() in the Day 16 section.
# v = 10 is the rsample default -- TODO confirm the intended number of folds.
penguin_folds <- vfold_cv(penguins_train, v = 10)
# ---- Day 16: Model Fitting and Workflow ----

# Model specifications ----

# Multinomial regression specification (kept under the name `log_reg_model`),
# using the nnet engine in classification mode.
log_reg_model <- multinom_reg() |>
  set_engine("nnet") |>
  set_mode("classification")

# Random forest specification. The ranger engine is stated explicitly; the
# original author notes it is the default for rand_forest().
rand_forest_model <- rand_forest() |>
  set_engine("ranger") |>
  set_mode("classification")

# Pre-processing ----

# Predict species from all other columns of the training data, normalizing
# every numeric predictor (mean = 0, variance = 1).
penguin_recipe <- recipe(species ~ ., data = penguins_train) |>
  step_normalize(all_numeric_predictors())

# Stand-alone workflows ----

# Each workflow bundles one model specification with the shared recipe so the
# preprocessing and the model travel together. The workflow set below is built
# from the recipe and model specs directly, not from these two objects.
log_ref_wf <- workflow() |>
  add_model(log_reg_model) |>
  add_recipe(penguin_recipe)

rand_forest_wf <- workflow() |>
  add_model(rand_forest_model) |>
  add_recipe(penguin_recipe)

# Model comparison ----

# Cross the single preprocessing recipe with both model specifications.
model_set <- workflow_set(
  preproc = list(recipe = penguin_recipe),
  models = list(
    logistic_regression = log_reg_model,
    random_forest = rand_forest_model
  )
)

# Fit every workflow in the set on the cross-validation folds, scoring by
# accuracy only. The seed is set immediately before so the run is repeatable.
set.seed(34215) # Randomly selected seed
results <- workflow_map(
  model_set,
  "fit_resamples",
  resamples = penguin_folds,
  metrics = metric_set(accuracy)
)

# Rank the fitted workflows by accuracy and show the table.
ranked_results <- rank_results(results, rank_metric = "accuracy")
print(ranked_results)
# Comment:
# Using a standard logistic_reg model, the mean accuracy was ~0.638 with poor
# stability. The rand_forest model was far superior in this case.
# Because the logistic_reg model is set up for binary outcomes, not multiclass
# classification, I used a different model type for log_reg_model.
# The multinom_reg model allows for multiclass classification, with the nnet
# engine handling more than two classes.
# After switching to the multinom_reg model, it outperformed the random_forest
# model by ~0.004 in mean accuracy, making it the better model for accuracy.
# However, the random_forest model outperformed the multinom_reg model by
# ~0.002 in accuracy std. error, making the random_forest model the better
# choice for stability.