Recoding Class as a Binary Factor
haiti_bin <- haiti %>%
  mutate(Class = if_else(Class == "Blue Tarp", "Blue Tarp", "Not Blue Tarp")) %>%
  mutate(Class = factor(Class))
#### Check distribution
table(haiti_bin$Class)
##
##     Blue Tarp Not Blue Tarp
##          2022         61219
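The two classes are heavily imbalanced: 2,022 blue-tarp pixels against 61,219 of everything else, which is what motivates the step_upsample() step used in every recipe below. A quick check of the proportions (a minimal sketch using base R's prop.table):
## Share of each class; blue tarps are roughly 3.2% of pixels (2022 / 63241)
prop.table(table(haiti_bin$Class))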
summary(haiti_bin)
##            Class            Red           Green            Blue
##  Blue Tarp    : 2022   Min.   : 48   Min.   : 48.0   Min.   : 44.0
##  Not Blue Tarp:61219   1st Qu.: 80   1st Qu.: 78.0   1st Qu.: 63.0
##                        Median :163   Median :148.0   Median :123.0
##                        Mean   :163   Mean   :153.7   Mean   :125.1
##                        3rd Qu.:255   3rd Qu.:226.0   3rd Qu.:181.0
##                        Max.   :255   Max.   :255.0   Max.   :255.0
glimpse(haiti_bin)
## Rows: 63,241
## Columns: 4
## $ Class <fct> Not Blue Tarp, Not Blue Tarp, Not Blue Tarp, Not Blue Tarp, Not …
## $ Red <dbl> 64, 64, 64, 75, 74, 72, 71, 69, 68, 67, 66, 64, 62, 82, 82, 79, …
## $ Green <dbl> 67, 67, 66, 82, 82, 76, 72, 70, 70, 70, 69, 67, 66, 85, 83, 80, …
## $ Blue <dbl> 50, 50, 49, 53, 54, 52, 51, 49, 49, 50, 50, 50, 50, 55, 54, 53, …
skimr::skim(haiti_bin)
Data summary

| Name                   | haiti_bin |
|------------------------|-----------|
| Number of rows         | 63241     |
| Number of columns      | 4         |
| Column type frequency: |           |
| factor                 | 1         |
| numeric                | 3         |
| Group variables        | None      |

Variable type: factor

| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts            |
|---------------|-----------|---------------|---------|----------|-----------------------|
| Class         | 0         | 1             | FALSE   | 2        | Not: 61219, Blu: 2022 |

Variable type: numeric

| skim_variable | n_missing | complete_rate | mean   | sd    | p0 | p25 | p50 | p75 | p100 | hist  |
|---------------|-----------|---------------|--------|-------|----|-----|-----|-----|------|-------|
| Red           | 0         | 1             | 162.98 | 78.88 | 48 | 80  | 163 | 255 | 255  | ▆▂▂▂▇ |
| Green         | 0         | 1             | 153.66 | 72.64 | 48 | 78  | 148 | 226 | 255  | ▇▃▂▃▇ |
| Blue          | 0         | 1             | 125.14 | 61.39 | 44 | 63  | 123 | 181 | 255  | ▇▂▃▆▁ |
colSums(is.na(haiti_bin))
## Class   Red Green  Blue
##     0     0     0     0
GGally::ggpairs(haiti, columns = c("Red", "Green", "Blue"))

haiti %>%
  pivot_longer(cols = c(Red, Green, Blue)) %>%
  ggplot(aes(x = value, fill = Class)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~name, scales = "free") +
  theme_minimal()

haiti %>%
  pivot_longer(cols = c(Red, Green, Blue)) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  facet_wrap(~name, scales = "free") +
  theme_minimal()

cor(haiti_bin %>% select(Red, Green, Blue))
##             Red     Green      Blue
## Red   1.0000000 0.9815987 0.9355294
## Green 0.9815987 1.0000000 0.9648024
## Blue  0.9355294 0.9648024 1.0000000
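The three channels are strongly correlated overall (r between roughly 0.94 and 0.98). If the within-class structure is of interest, the same check can be run separately for each class; a minimal sketch (a hypothetical extension, output not shown in this report):
## Channel correlations computed within each class
haiti_bin %>%
  group_by(Class) %>%
  group_map(~ cor(select(.x, Red, Green, Blue)))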
Model Training
Setup CV Folds and Data
set.seed(43)
## small_holdout <- holdout_combined %>% sample_n(100000)  ## Less Data
small_holdout <- holdout_combined  ## Full Data
## Define CV folds on training data
cv_folds <- vfold_cv(train_data, v = 5, strata = Class)
metrics_class <- metric_set(accuracy, recall, precision, f_meas)
metrics_prob <- metric_set(roc_auc)
## Helper function to get class predictions with threshold
get_class_preds <- function(pred_probs, holdout_df,
                            class_name = "Blue Tarp", threshold = 0.5) {
  pred_col <- paste0(".pred_", class_name)  ## keep spaces as is
  bind_cols(pred_probs, holdout_df) %>%
    mutate(pred_class = if_else(.data[[pred_col]] > threshold,
                                class_name, paste("Not", class_name)))
}
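For illustration, a minimal usage sketch of the helper on toy inputs (these tibbles are hypothetical, not from the dataset):
toy_probs <- tibble(
  `.pred_Blue Tarp`     = c(0.9, 0.3),
  `.pred_Not Blue Tarp` = c(0.1, 0.7)
)
toy_truth <- tibble(Class = c("Blue Tarp", "Not Blue Tarp"))
## First row exceeds the threshold (0.9 > 0.5) and is labeled "Blue Tarp";
## the second (0.3) falls to "Not Blue Tarp"
get_class_preds(toy_probs, toy_truth, threshold = 0.5)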
Logistic Regression
log_rec <- recipe(Class ~ Red + Green + Blue, data = train_data) %>%
  step_upsample(Class)
#### Model Building / Cross Val
log_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")
log_wf <- workflow() %>%
  add_model(log_spec) %>%
  add_recipe(log_rec)
log_fit_cv <- log_wf %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = metric_set(accuracy, roc_auc),
    control = control_resamples(save_pred = TRUE)
  )
log_metrics <- collect_metrics(log_fit_cv) %>%
  mutate(Model = "Logistic Regression")
log_metrics %>% select(.metric, mean)
## Fit
log_final_fit <- fit(log_wf, data = train_data)
## Predict probabilities and class on holdout
log_probs <- predict(log_final_fit, small_holdout, type = "prob")
log_preds <- get_class_preds(log_probs, small_holdout,
                             class_name = "Blue Tarp", threshold = 0.5) %>%
  mutate(
    Class = factor(Class),
    pred_class = factor(pred_class, levels = levels(Class))
  )
log_metrics_holdout <- log_preds %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "Logistic Regression")
log_roc_auc_holdout <- log_probs %>%
  bind_cols(Class = small_holdout$Class) %>%
  metrics_prob(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "Logistic Regression")
## Combine threshold-based metrics and ROC AUC
log_metrics_combined <- log_metrics_holdout %>%
  select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  bind_cols(log_roc_auc_holdout %>% select(roc_auc = .estimate)) %>%
  mutate(Model = "Logistic Regression") %>%
  select(Model, accuracy, precision, recall, f_meas, roc_auc)
log_cm <- conf_mat(log_preds, truth = Class, estimate = pred_class)
log_cm %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix: Logistic Regression on Holdout Set") +
  theme_minimal()

## Extract confusion matrix as a data frame
log_cm_table <- as_tibble(log_cm$table) %>%
  rename(Count = n)
## Wide (matrix) view of the confusion matrix
log_cm_matrix <- log_cm_table %>%
  pivot_wider(names_from = Prediction, values_from = Count) %>%
  rename(`Actual Class` = Truth)
## Tabular matrix + performance
kable(log_metrics_combined, digits = 3,
      caption = "Logistic Regression Performance on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options =
                              c("striped", "hover", "condensed"))
Logistic Regression Performance on Holdout Set

| Model               | accuracy | precision | recall | f_meas | roc_auc |
|---------------------|----------|-----------|--------|--------|---------|
| Logistic Regression | 0.914    | 0.077     | 1      | 0.144  | 1       |
kable(log_cm_matrix, digits = 0,
      caption =
        "Confusion Matrix (Matrix View): Logistic Regression on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options =
                              c("striped", "hover", "condensed"))
Confusion Matrix (Matrix View): Logistic Regression on Holdout Set

| Actual Class  | Blue Tarp | Not Blue Tarp |
|---------------|-----------|---------------|
| Blue Tarp     | 14480     | 0             |
| Not Blue Tarp | 172760    | 1816937       |
QDA
qda_spec <- discrim_quad() %>%
  set_engine("MASS") %>%
  set_mode("classification")
#### Recipe with upsampling
qda_rec <- recipe(Class ~ Red + Green + Blue, data = train_data) %>%
  step_upsample(Class)
## WF
qda_wf <- workflow() %>%
  add_model(qda_spec) %>%
  add_recipe(qda_rec)
## CV
qda_fit_cv <- qda_wf %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = metric_set(accuracy, roc_auc),
    control = control_resamples(save_pred = TRUE)
  )
## Metrics
qda_metrics_cv <- collect_metrics(qda_fit_cv) %>%
  mutate(Model = "QDA")
## Show mean metrics
qda_metrics_cv %>%
  dplyr::select(.metric, mean)
## Final fit on full training data
qda_final_fit <- fit(qda_wf, data = train_data)
## Threshold tuning on training data
qda_probs_train <- predict(qda_final_fit,
                           new_data = train_data, type = "prob") %>%
  bind_cols(Class = train_data$Class)
qda_threshold_perf <- probably::threshold_perf(
  qda_probs_train,
  truth = Class,
  estimate = `.pred_Blue Tarp`,
  thresholds = seq(0.05, 0.95, by = 0.01),
  metrics = metric_set(f_meas),
  event_level = "first"
)
qda_best_threshold <- qda_threshold_perf %>%
  arrange(desc(.estimate)) %>%
  slice(1) %>%
  pull(.threshold)
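Note that threshold_perf() with event_level = "first" scores the first factor level as the positive class. factor() orders levels alphabetically, so "Blue Tarp" sorts ahead of "Not Blue Tarp" and is the event here; a one-line sanity check (a sketch, assuming train_data carries the factored Class):
levels(train_data$Class)
## expected (alphabetical): "Blue Tarp" "Not Blue Tarp"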
## Predict on holdout set
qda_probs_holdout <- predict(qda_final_fit, small_holdout, type = "prob")
qda_preds_holdout <- get_class_preds(qda_probs_holdout, small_holdout,
                                     class_name = "Blue Tarp",
                                     threshold = qda_best_threshold) %>%
  mutate(
    Class = factor(Class),
    pred_class = factor(pred_class, levels = levels(Class))
  )
## Calculate holdout metrics
qda_metrics_holdout <- qda_preds_holdout %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "QDA")
qda_roc_auc_holdout <- qda_probs_holdout %>%
  bind_cols(Class = small_holdout$Class) %>%
  metrics_prob(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "QDA")
## Combine holdout metrics and roc_auc
qda_metrics_combined <- qda_metrics_holdout %>%
  dplyr::select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  bind_cols(qda_roc_auc_holdout %>%
              dplyr::select(roc_auc = .estimate)) %>%
  mutate(Model = "QDA") %>%
  dplyr::select(Model, accuracy, precision, recall, f_meas, roc_auc)
## Confusion matrix on holdout
qda_cm <- conf_mat(qda_preds_holdout, truth = Class, estimate = pred_class)
## Matrix heatmap
qda_cm %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix: QDA on Holdout Set") +
  theme_minimal()

qda_cm_table <- as_tibble(qda_cm$table) %>%
  rename(Count = n) %>%
  pivot_wider(names_from = Prediction, values_from = Count) %>%
  rename(`Actual Class` = Truth)
## Display tables (metrics and confusion matrix)
kable(qda_metrics_combined, digits = 3,
      caption = "QDA Performance on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
QDA Performance on Holdout Set

| Model | accuracy | precision | recall | f_meas | roc_auc |
|-------|----------|-----------|--------|--------|---------|
| QDA   | 0.995    | 0.626     | 0.758  | 0.686  | 0.991   |
kable(qda_cm_table, digits = 0,
      caption = "Confusion Matrix (Matrix View): QDA on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Confusion Matrix (Matrix View): QDA on Holdout Set

| Actual Class  | Blue Tarp | Not Blue Tarp |
|---------------|-----------|---------------|
| Blue Tarp     | 10974     | 3506          |
| Not Blue Tarp | 6550      | 1983147       |
## Ensure training and holdout predictions include truth column
qda_probs_train <- predict(qda_final_fit, new_data = train_data, type = "prob") %>%
  bind_cols(Class = train_data$Class)
qda_probs_holdout <- predict(qda_final_fit, new_data = small_holdout, type = "prob") %>%
  bind_cols(Class = small_holdout$Class)
## ROC curve data
qda_roc_train <- roc_curve(qda_probs_train, truth = Class, `.pred_Blue Tarp`)
qda_roc_test <- roc_curve(qda_probs_holdout, truth = Class, `.pred_Blue Tarp`)
## Plot ROC curves
qda_roc_train_plot <- autoplot(qda_roc_train) +
  labs(title = "QDA ROC - Training Set") +
  theme_minimal()
qda_roc_test_plot <- autoplot(qda_roc_test) +
  labs(title = "QDA ROC - Holdout Set") +
  theme_minimal()
## Display plots side by side
qda_roc_train_plot | qda_roc_test_plot

qda_fmeasure_plot <- ggplot(qda_threshold_perf, aes(x = .threshold, y = .estimate)) +
  geom_line(color = "steelblue") +
  geom_point(data = filter(qda_threshold_perf, .threshold == qda_best_threshold),
             color = "red", size = 3) +
  labs(title = "QDA - F-measure vs Threshold", x = "Threshold", y = "F-measure") +
  theme_minimal()
print(qda_best_threshold)
## [1] 0.9
qda_fmeasure_plot

KNN
knn_rec <- recipe(Class ~ Red + Green + Blue, data = train_data) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_upsample(Class)
knn_wf <- workflow() %>%
  add_model(
    nearest_neighbor(neighbors = tune()) %>%
      set_engine("kknn") %>%
      set_mode("classification")
  ) %>%
  add_recipe(knn_rec)
## Define search space for neighbors
knn_params <- parameters(neighbors(range = c(1, 100)))
## Tune with Bayesian optimization
knn_tuned <- tune_bayes(
  knn_wf,
  resamples = cv_folds,
  param_info = knn_params,
  metrics = metric_set(roc_auc),
  iter = 25,
  control = control_bayes(no_improve = 5, verbose = TRUE)
)
## Plot tuning results
autoplot(knn_tuned)

## Select best model by ROC AUC
best_knn <- select_best(knn_tuned, metric = "roc_auc")
## Finalize workflow with best number of neighbors
final_knn_wf <- finalize_workflow(knn_wf, best_knn)
## Refit on full training data
knn_final_fit <- fit(final_knn_wf, data = train_data)
## Predict probabilities and classes on holdout
knn_probs <- predict(knn_final_fit, small_holdout, type = "prob")
## Sweep candidate thresholds and score each; note that, unlike QDA,
## this sweep evaluates F-measure on the holdout predictions
thresholds <- tibble(threshold = seq(0.01, 0.99, by = 0.01))
threshold_eval <- thresholds %>%
  mutate(
    metrics = map(threshold, ~ {
      get_class_preds(knn_probs, small_holdout,
                      class_name = "Blue Tarp", threshold = .x) %>%
        mutate(
          Class = factor(small_holdout$Class,
                         levels = c("Blue Tarp", "Not Blue Tarp")),
          pred_class = factor(pred_class, levels = c("Blue Tarp", "Not Blue Tarp"))
        ) %>%
        metrics_class(truth = Class, estimate = pred_class)
    })
  ) %>%
  unnest(metrics)
## Pick the threshold with the best F-measure
best_threshold <- threshold_eval %>%
  filter(.metric == "f_meas") %>%
  arrange(desc(.estimate)) %>%
  slice(1) %>%
  pull(threshold)
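As an aside, the manual map() sweep above could be written more compactly with probably::threshold_perf(), as in the QDA section; a minimal equivalent sketch (assuming knn_probs and small_holdout as above):
## Same F-measure-vs-threshold sweep via threshold_perf()
knn_threshold_perf <- probably::threshold_perf(
  bind_cols(knn_probs,
            Class = factor(small_holdout$Class,
                           levels = c("Blue Tarp", "Not Blue Tarp"))),
  truth = Class,
  estimate = `.pred_Blue Tarp`,
  thresholds = seq(0.01, 0.99, by = 0.01),
  metrics = metric_set(f_meas),
  event_level = "first"
)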
knn_preds <- get_class_preds(knn_probs, small_holdout,
                             class_name = "Blue Tarp", threshold = best_threshold) %>%
  mutate(
    Class = factor(Class, levels = c("Blue Tarp", "Not Blue Tarp")),
    pred_class = factor(pred_class, levels = levels(Class))
  )
## Confusion Matrix
knn_cm <- conf_mat(knn_preds, truth = Class, estimate = pred_class)
## Metrics
knn_metrics_holdout <- knn_preds %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "KNN")
knn_roc_auc_holdout <- knn_probs %>%
  bind_cols(Class = small_holdout$Class) %>%
  metrics_prob(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "KNN")
print(best_threshold)
## [1] 0.98
## F-measure vs threshold plot
ggplot(threshold_eval %>% filter(.metric == "f_meas"),
       aes(x = threshold, y = .estimate)) +
  geom_line(color = "steelblue") +
  geom_point(data = filter(threshold_eval, .metric == "f_meas",
                           threshold == best_threshold),
             color = "red", size = 3) +
  labs(title = "KNN - F-measure vs Threshold",
       x = "Threshold", y = "F-measure") +
  theme_minimal()

## Combine metrics
knn_metrics_combined <- knn_metrics_holdout %>%
  dplyr::select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  bind_cols(knn_roc_auc_holdout %>% dplyr::select(roc_auc = .estimate)) %>%
  mutate(Model = "KNN") %>%
  dplyr::select(Model, accuracy, precision, recall, f_meas, roc_auc)
## Matrix heatmap
knn_cm %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix: KNN on Holdout Set") +
  theme_minimal()

## Display Confusion Matrix Table
knn_cm_table <- as_tibble(knn_cm$table) %>%
  rename(Count = n) %>%
  pivot_wider(names_from = Prediction, values_from = Count) %>%
  rename(`Actual Class` = Truth)
## Display Tables
kable(knn_metrics_combined, digits = 3,
      caption = "KNN Performance on Holdout Set (Tuned)") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
KNN Performance on Holdout Set (Tuned)

| Model | accuracy | precision | recall | f_meas | roc_auc |
|-------|----------|-----------|--------|--------|---------|
| KNN   | 0.983    | 0.274     | 0.782  | 0.406  | 0.945   |
kable(knn_cm_table, digits = 0,
      caption = "Confusion Matrix (Matrix View): KNN on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Confusion Matrix (Matrix View): KNN on Holdout Set

| Actual Class  | Blue Tarp | Not Blue Tarp |
|---------------|-----------|---------------|
| Blue Tarp     | 11322     | 3158          |
| Not Blue Tarp | 29982     | 1959715       |
Elastic Net
elastic_spec <- logistic_reg(
  penalty = tune(),  ## tunable
  mixture = tune()   ## tunable
) %>%
  set_engine("glmnet") %>%
  set_mode("classification")
## Upsample recipe
elastic_rec <- recipe(Class ~ Red + Green + Blue, data = train_data) %>%
  step_upsample(Class)
## WF
elastic_wf <- workflow() %>%
  add_model(elastic_spec) %>%
  add_recipe(elastic_rec)
## Grid for tuning
elastic_grid <- grid_regular(
  mixture(range = c(0, 1)),
  penalty(range = c(-4, 0)),  ## log10 scale: 10^-4 to 1
  levels = 5
)
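With levels = 5, grid_regular() crosses five mixture values (0 to 1) with five penalty values (log-spaced from 10^-4 to 1), giving 25 candidate pairs; printing the grid confirms this (a quick sketch):
## Inspect the 5 x 5 = 25-row tuning grid
elastic_grid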
## CV
elastic_tuned <- tune_grid(
  elastic_wf,
  resamples = cv_folds,
  grid = elastic_grid,
  metrics = metric_set(roc_auc),
  control = control_grid(save_pred = TRUE)
)
## Best parameters
best_params <- select_best(elastic_tuned, metric = "roc_auc")
## Finalize workflow
final_elastic_wf <- finalize_workflow(elastic_wf, best_params)
elastic_final_fit <- fit(final_elastic_wf, data = train_data)
## Predict probabilities on training data for threshold tuning
elastic_probs_train <- predict(elastic_final_fit, new_data = train_data, type = "prob") %>%
  bind_cols(Class = train_data$Class)
## Threshold tuning based on F-measure (as for QDA)
elastic_threshold_perf <- probably::threshold_perf(
  elastic_probs_train,
  truth = Class,
  estimate = `.pred_Blue Tarp`,
  thresholds = seq(0.05, 0.95, by = 0.01),
  metrics = metric_set(f_meas),
  event_level = "first"
)
elastic_best_threshold <- elastic_threshold_perf %>%
  arrange(desc(.estimate)) %>%
  slice(1) %>%
  pull(.threshold)
## Predict on holdout with tuned threshold
elastic_probs_holdout <- predict(elastic_final_fit, small_holdout, type = "prob")
elastic_preds_holdout <- get_class_preds(elastic_probs_holdout, small_holdout,
                                         class_name = "Blue Tarp",
                                         threshold = elastic_best_threshold) %>%
  mutate(
    Class = factor(Class),
    pred_class = factor(pred_class, levels = levels(Class))
  )
## Calculate holdout classification metrics
elastic_metrics_holdout <- elastic_preds_holdout %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "Elastic Net")
## Calculate ROC AUC on holdout
elastic_roc_auc_holdout <- elastic_probs_holdout %>%
  bind_cols(Class = small_holdout$Class) %>%
  metrics_prob(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "Elastic Net")
## Combine holdout metrics and roc_auc
elastic_metrics_combined <- elastic_metrics_holdout %>%
  dplyr::select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  bind_cols(elastic_roc_auc_holdout %>% dplyr::select(roc_auc = .estimate)) %>%
  mutate(Model = "Elastic Net") %>%
  dplyr::select(Model, accuracy, precision, recall, f_meas, roc_auc)
## Confusion matrix on holdout
elastic_cm <- conf_mat(elastic_preds_holdout, truth = Class, estimate = pred_class)
## Plot confusion matrix heatmap
elastic_cm %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix: Elastic Net on Holdout Set") +
  theme_minimal()

## Confusion matrix table for display
elastic_cm_table <- as_tibble(elastic_cm$table) %>%
  rename(Count = n) %>%
  pivot_wider(names_from = Prediction, values_from = Count) %>%
  rename(`Actual Class` = Truth)
## Display tables (metrics and confusion matrix)
kable(elastic_metrics_combined, digits = 3,
      caption = "Elastic Net Performance on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Elastic Net Performance on Holdout Set

| Model       | accuracy | precision | recall | f_meas | roc_auc |
|-------------|----------|-----------|--------|--------|---------|
| Elastic Net | 0.958    | 0.146     | 1      | 0.254  | 1       |
kable(elastic_cm_table, digits = 0,
      caption = "Confusion Matrix (Matrix View): Elastic Net on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Confusion Matrix (Matrix View): Elastic Net on Holdout Set

| Actual Class  | Blue Tarp | Not Blue Tarp |
|---------------|-----------|---------------|
| Blue Tarp     | 14474     | 6             |
| Not Blue Tarp | 84874     | 1904823       |
## Plot ROC curves (Train vs Holdout)
## Training ROC curve
elastic_roc_train <- roc_curve(elastic_probs_train, truth = Class, `.pred_Blue Tarp`)
## Holdout predictions + bind true labels
elastic_probs_holdout <- predict(elastic_final_fit, new_data = small_holdout, type = "prob") %>%
  bind_cols(Class = small_holdout$Class)
## Holdout ROC curve
elastic_roc_holdout <- roc_curve(elastic_probs_holdout, truth = Class, `.pred_Blue Tarp`)
## Plot ROC curves
elastic_roc_train_plot <- autoplot(elastic_roc_train) +
  labs(title = "Elastic Net ROC - Training Set") +
  theme_minimal()
elastic_roc_test_plot <- autoplot(elastic_roc_holdout) +
  labs(title = "Elastic Net ROC - Holdout Set") +
  theme_minimal()
## Display plots side by side
elastic_roc_train_plot | elastic_roc_test_plot

print(elastic_best_threshold)
## [1] 0.76
## Plot F-measure vs Threshold with optimal point
ggplot(elastic_threshold_perf, aes(x = .threshold, y = .estimate)) +
  geom_line(color = "steelblue") +
  geom_point(data = filter(elastic_threshold_perf, .threshold == elastic_best_threshold),
             color = "red", size = 3) +
  labs(title = "Elastic Net - F-measure vs Threshold", x = "Threshold", y = "F-measure") +
  theme_minimal()

Random Forest
## Random Forest spec
rf_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("classification")
rf_rec <- recipe(Class ~ Red + Green + Blue, data = train_data) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_upsample(Class)
rf_wf <- workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(rf_rec)
## Define tuning grid
rf_grid <- grid_regular(
  mtry(range = c(1, 3)),   ## only 3 predictors (Red, Green, Blue)
  min_n(range = c(2, 40)),
  levels = 4
)
## Tune Random Forest
rf_tuned <- tune_grid(
  rf_wf,
  resamples = cv_folds,
  grid = rf_grid,
  metrics = metric_set(roc_auc),
  control = control_grid(save_pred = TRUE)
)
## Plot tuning results
autoplot(rf_tuned)

## Finalize best model
best_rf <- select_best(rf_tuned, metric = "roc_auc")
final_rf_wf <- finalize_workflow(rf_wf, best_rf)
rf_final_fit <- fit(final_rf_wf, data = train_data)
## Predict probabilities on training data
rf_probs_train <- predict(rf_final_fit, new_data = train_data, type = "prob") %>%
  bind_cols(Class = train_data$Class)
rf_threshold_perf <- probably::threshold_perf(
  rf_probs_train,
  truth = Class,
  estimate = `.pred_Blue Tarp`,
  thresholds = seq(0.05, 0.95, by = 0.01),
  metrics = metric_set(f_meas),
  event_level = "first"
)
rf_best_threshold <- rf_threshold_perf %>%
  arrange(desc(.estimate)) %>%
  slice(1) %>%
  pull(.threshold)
print(rf_best_threshold)
## [1] 0.83
## F-measure vs threshold plot
ggplot(rf_threshold_perf, aes(x = .threshold, y = .estimate)) +
  geom_line(color = "steelblue") +
  geom_point(data = filter(rf_threshold_perf, .threshold == rf_best_threshold),
             color = "red", size = 3) +
  labs(title = "Random Forest - F-measure vs Threshold", x = "Threshold", y = "F-measure") +
  theme_minimal()

## Predict on holdout set using best threshold
rf_probs <- predict(rf_final_fit, small_holdout, type = "prob")
rf_preds <- get_class_preds(rf_probs, small_holdout,
                            class_name = "Blue Tarp", threshold = rf_best_threshold) %>%
  mutate(
    Class = factor(Class, levels = c("Blue Tarp", "Not Blue Tarp")),
    pred_class = factor(pred_class, levels = levels(Class))
  )
## Confusion matrix + ROC AUC + metrics
rf_cm <- conf_mat(rf_preds, truth = Class, estimate = pred_class)
rf_metrics_holdout <- rf_preds %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "Random Forest")
rf_roc_auc_holdout <- rf_probs %>%
  bind_cols(Class = small_holdout$Class) %>%
  metrics_prob(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "Random Forest")
## Combine metrics table
rf_metrics_combined <- rf_metrics_holdout %>%
  dplyr::select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  bind_cols(rf_roc_auc_holdout %>% dplyr::select(roc_auc = .estimate)) %>%
  mutate(Model = "Random Forest") %>%
  dplyr::select(Model, accuracy, precision, recall, f_meas, roc_auc)
## Confusion matrix table formatting
rf_cm_table <- as_tibble(rf_cm$table) %>%
  rename(Count = n) %>%
  pivot_wider(names_from = Prediction, values_from = Count) %>%
  rename(`Actual Class` = Truth)
## Display metrics + confusion matrix
kable(rf_metrics_combined, digits = 3,
      caption = "Random Forest Performance on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Random Forest Performance on Holdout Set

| Model         | accuracy | precision | recall | f_meas | roc_auc |
|---------------|----------|-----------|--------|--------|---------|
| Random Forest | 0.994    | 0.616     | 0.511  | 0.559  | 0.978   |
kable(rf_cm_table, digits = 0,
      caption = "Confusion Matrix (Matrix View): RF on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Confusion Matrix (Matrix View): RF on Holdout Set

| Actual Class  | Blue Tarp | Not Blue Tarp |
|---------------|-----------|---------------|
| Blue Tarp     | 7396      | 7084          |
| Not Blue Tarp | 4605      | 1985092       |
## Heatmap-style confusion matrix
rf_cm %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix: Random Forest on Holdout Set") +
  theme_minimal()

## ROC Curves (Train vs Holdout)
rf_probs_train <- predict(rf_final_fit, new_data = train_data, type = "prob") %>%
  bind_cols(Class = train_data$Class)
rf_probs_holdout <- predict(rf_final_fit, new_data = small_holdout, type = "prob") %>%
  bind_cols(Class = small_holdout$Class)
rf_roc_train <- roc_curve(rf_probs_train, truth = Class, `.pred_Blue Tarp`)
rf_roc_test <- roc_curve(rf_probs_holdout, truth = Class, `.pred_Blue Tarp`)
rf_roc_train_plot <- autoplot(rf_roc_train) +
  labs(title = "Random Forest ROC - Training Set") +
  theme_minimal()
rf_roc_test_plot <- autoplot(rf_roc_test) +
  labs(title = "Random Forest ROC - Holdout Set") +
  theme_minimal()
rf_roc_train_plot | rf_roc_test_plot

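Finally, to compare the five models at a glance, the per-model holdout summaries built above can be stacked into one table. A minimal sketch, assuming the *_metrics_combined tibbles from the preceding sections are still in the session:
## Stack per-model holdout metrics into a single comparison table
all_metrics <- bind_rows(
  log_metrics_combined,
  qda_metrics_combined,
  knn_metrics_combined,
  elastic_metrics_combined,
  rf_metrics_combined
)
kable(all_metrics, digits = 3, caption = "Holdout Performance: All Models") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))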