library(tidymodels)
library(tidyverse)
library(skimr)
library(discrim)
library(patchwork)
library(probably)
library(knitr)
library(scales)
library(kknn)
library(themis)
library(future)
library(parallel)
set.seed(123)
#### Detect number of available physical cores.
#### detectCores() can return NA on some platforms; fall back to a single
#### worker so plan() below never receives an invalid worker count.
num_cores <- parallel::detectCores(logical = FALSE)
if (is.na(num_cores)) num_cores <- 1L

#### Set up future plan with all physical cores
plan(multisession, workers = num_cores)
#### Load the pixel data and encode the response as a factor in one pipeline
haiti <- read_csv("HaitiPixels.csv") %>%
  mutate(Class = factor(Class))

Collapsing Class into a binary factor: Blue Tarp vs. Not Blue Tarp.

#### Binary target: "Blue Tarp" vs. everything else, stored as a factor
haiti_bin <- haiti %>%
  mutate(
    Class = factor(
      if_else(Class == "Blue Tarp", "Blue Tarp", "Not Blue Tarp")
    )
  )

#### Inspect the (heavily imbalanced) class distribution
table(haiti_bin$Class)
## 
##     Blue Tarp Not Blue Tarp 
##          2022         61219
summary(haiti_bin)
##            Class            Red          Green            Blue      
##  Blue Tarp    : 2022   Min.   : 48   Min.   : 48.0   Min.   : 44.0  
##  Not Blue Tarp:61219   1st Qu.: 80   1st Qu.: 78.0   1st Qu.: 63.0  
##                        Median :163   Median :148.0   Median :123.0  
##                        Mean   :163   Mean   :153.7   Mean   :125.1  
##                        3rd Qu.:255   3rd Qu.:226.0   3rd Qu.:181.0  
##                        Max.   :255   Max.   :255.0   Max.   :255.0
#### Structural overview: 63,241 rows, 4 columns (Class + three RGB channels)
glimpse(haiti_bin)
## Rows: 63,241
## Columns: 4
## $ Class <fct> Not Blue Tarp, Not Blue Tarp, Not Blue Tarp, Not Blue Tarp, Not …
## $ Red   <dbl> 64, 64, 64, 75, 74, 72, 71, 69, 68, 67, 66, 64, 62, 82, 82, 79, …
## $ Green <dbl> 67, 67, 66, 82, 82, 76, 72, 70, 70, 70, 69, 67, 66, 85, 83, 80, …
## $ Blue  <dbl> 50, 50, 49, 53, 54, 52, 51, 49, 49, 50, 50, 50, 50, 55, 54, 53, …
#### Full summary statistics (missingness, quantiles, mini-histograms)
skimr::skim(haiti_bin)
Data summary
Name haiti_bin
Number of rows 63241
Number of columns 4
_______________________
Column type frequency:
factor 1
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Class 0 1 FALSE 2 Not: 61219, Blu: 2022

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Red 0 1 162.98 78.88 48 80 163 255 255 ▆▂▂▂▇
Green 0 1 153.66 72.64 48 78 148 226 255 ▇▃▂▃▇
Blue 0 1 125.14 61.39 44 63 123 181 255 ▇▂▃▆▁
#### Confirm there are no missing values in any column
colSums(is.na(haiti_bin))
## Class   Red Green  Blue 
##     0     0     0     0
#### Pairwise scatter plots / correlations of the three channels
GGally::ggpairs(haiti, columns = c("Red", "Green", "Blue"))

#### Channel densities split by (multi-level) class
haiti %>%
  pivot_longer(cols = c(Red, Green, Blue)) %>%
  ggplot(aes(x = value, fill = Class)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~name, scales = "free") +
  theme_minimal()

#### Overall per-channel intensity histograms
haiti %>%
  pivot_longer(cols = c(Red, Green, Blue)) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  facet_wrap(~name, scales = "free") +
  theme_minimal()

#### The RGB channels are highly correlated (all pairs > 0.93)
cor(haiti_bin %>% select(Red, Green, Blue))
##             Red     Green      Blue
## Red   1.0000000 0.9815987 0.9355294
## Green 0.9815987 1.0000000 0.9648024
## Blue  0.9355294 0.9648024 1.0000000

Reading the holdout ROI files and combining them into a single data frame.

Helper function to extract the pixel data from a single holdout file.

read_holdout_file <- function(file_path) {
  #### Read a single ENVI ROI holdout export and return a tibble with
  #### Red, Green, Blue pixel columns plus a Class label inferred from
  #### the file name.
  ####
  #### file_path: path to a ROI .txt export; metadata lines start with ';'.
  #### Returns: tibble with columns Red, Green, Blue, Class (character).
  lines <- readLines(file_path)
  #### Remove all metadata lines starting with ';'
  data_lines <- lines[!grepl("^;", lines)]
  #### Round-trip the remaining lines through a temp file so read_table
  #### can parse them; on.exit guarantees cleanup even if parsing fails.
  temp_file <- tempfile()
  on.exit(unlink(temp_file), add = TRUE)
  writeLines(data_lines, temp_file)
  #### No header row: the first remaining line is already data
  df <- suppressMessages(readr::read_table(temp_file, col_names = FALSE))
  #### The last 3 columns are the RGB bands (B1, B2, B3)
  rgb_cols_indices <- (ncol(df) - 2):ncol(df)
  colnames(df)[rgb_cols_indices] <- c("Red", "Green", "Blue")
  df_rgb <- df %>% dplyr::select(Red, Green, Blue)
  #### BUG FIX: grepl("Blue_Tarps", ...) alone also matches
  #### "NON_Blue_Tarps" / "NOT_Blue_Tarps" file names, which mislabeled
  #### negative files as "Blue Tarp". Check the negative markers first.
  class_label <- if (grepl("NON_Blue_Tarps|NOT_Blue_Tarps", file_path)) {
    "Not Blue Tarp"
  } else if (grepl("Blue_Tarps", file_path)) {
    "Blue Tarp"
  } else {
    "Not Blue Tarp"
  }
  df_rgb %>% mutate(Class = class_label)
}

Testing the reader on one file and comparing the first rows against the raw file opened in a text editor.

#### Sanity check: parse one holdout file and eyeball the first rows
df_test <- read_holdout_file("orthovnir078_ROI_NON_Blue_Tarps.txt")
head(df_test)

#### Read every holdout ROI file and tag it with its true class label.
#### Note: the orthovnir067 blue tarps come in two variants (with and
#### without embedded metadata); we use the plain *_data.txt version.
blue_tarp_files <- c(
  "orthovnir067_ROI_Blue_Tarps_data.txt",
  "orthovnir069_ROI_Blue_Tarps.txt",
  "orthovnir078_ROI_Blue_Tarps.txt"
)
non_blue_files <- c(
  "orthovnir057_ROI_NON_Blue_Tarps.txt",
  "orthovnir067_ROI_NOT_Blue_Tarps.txt",
  "orthovnir069_ROI_NOT_Blue_Tarps.txt",
  "orthovnir078_ROI_NON_Blue_Tarps.txt"
)

#### Parse each group, forcing the class label explicitly
blue_frames <- lapply(blue_tarp_files, function(path) {
  read_holdout_file(path) %>% mutate(Class = "Blue Tarp")
})
non_blue_frames <- lapply(non_blue_files, function(path) {
  read_holdout_file(path) %>% mutate(Class = "Not Blue Tarp")
})

#### Stack everything (blue tarps first, matching the original order)
#### and encode the label as a factor
holdout_combined <- bind_rows(c(blue_frames, non_blue_frames)) %>%
  mutate(Class = factor(Class))

Saving the combined holdout set and the binary training data to CSV files.

#### Persist both datasets so Parts 2 & 3 can run standalone
write_csv(holdout_combined, "Holdout_Pixels_Combined.csv")
write_csv(haiti_bin, "train_data.csv")
#### Summary statistics of the combined holdout set
skimr::skim(holdout_combined)
Data summary
Name holdout_combined
Number of rows 2004177
Number of columns 4
_______________________
Column type frequency:
factor 1
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Class 0 1 FALSE 2 Not: 1989697, Blu: 14480

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Red 0 1 118.26 56.87 27 76 107 139 255 ▅▇▅▂▂
Green 0 1 105.38 52.72 28 71 91 117 255 ▅▇▂▁▁
Blue 0 1 82.36 46.65 25 55 66 88 255 ▇▃▁▁▁

Check if Blue is dominant and we imported data correctly

#### Import sanity check: for true blue tarps, Blue should be the
#### strongest channel on the vast majority of pixels.
holdout_combined1 <- holdout_combined %>%
  mutate(is_blue_dominant = Blue > Red & Blue > Green)

#### Cross-tabulate blue-dominance against the true class
table(holdout_combined1$Class, holdout_combined1$is_blue_dominant)
##                
##                   FALSE    TRUE
##   Blue Tarp         364   14116
##   Not Blue Tarp 1978739   10958
#### Share of Blue Tarp pixels whose dominant channel is Blue
blue_tarp_accuracy <- holdout_combined1 %>%
  filter(Class == "Blue Tarp") %>%
  summarize(
    total = n(),
    blue_dominant = sum(is_blue_dominant),
    pct_blue_dominant = blue_dominant / total
  )

blue_tarp_accuracy

Visualizing the RGB channel distributions by class.

#### Density of each RGB channel, split by binary class
train_data <- haiti_bin
train_data %>%
  pivot_longer(c(Red, Green, Blue), names_to = "Channel", values_to = "Value") %>%
  ggplot(aes(Value, fill = Class)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~Channel, scales = "free") +
  labs(title = "Density of RGB Channels by Class") +
  theme_minimal()

#### Boxplots of pixel intensity per channel and (multi-level) class
haiti %>%
  pivot_longer(c(Red, Green, Blue), names_to = "Channel", values_to = "Value") %>%
  ggplot(aes(Class, Value, fill = Class)) +
  geom_boxplot(outlier.shape = 21, outlier.size = 1.2) +
  facet_wrap(~Channel, scales = "free") +
  labs(
    title = "Boxplots of RGB Channels by Class",
    x = "Roof Class",
    y = "Pixel Intensity"
  ) +
  theme_minimal() +
  theme(
    #### Rotate x labels so the long class names stay readable
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    legend.position = "none"
  )

Parts 2 & 3: model training, threshold tuning, and holdout evaluation.

#### Reload the persisted datasets so Parts 2 & 3 can run standalone.
#### CONSISTENCY FIX: the original mixed readr::read_csv (tibble) with
#### base read.csv (data.frame) — use read_csv for both so column typing
#### and the returned class are identical.
train_data <- read_csv("train_data.csv") %>%
  mutate(Class = factor(Class))
holdout_combined <- read_csv("Holdout_Pixels_Combined.csv") %>%
  mutate(Class = factor(Class))

Model Training

Setup CV Folds and Data

set.seed(43)
##small_holdout <- holdout_combined %>% sample_n(100000) ## Less Data
small_holdout<-holdout_combined ## Full Data
## 5-fold CV on the training data, stratified so every fold preserves
## the rare Blue Tarp proportion
cv_folds <- vfold_cv(train_data, v = 5, strata = Class)
## Threshold-based (hard-label) metrics vs. probability-based ROC AUC
metrics_class <- metric_set(accuracy, recall, precision, f_meas)
metrics_prob <- metric_set(roc_auc)
## Helper function to get class predictions with threshold
## Attach hard class predictions to a tibble of predicted probabilities.
## pred_probs: tibble of .pred_* columns from predict(type = "prob")
## holdout_df: data frame with the truth column, bound alongside
## class_name: positive class; its probabilities live in ".pred_<class_name>"
##             (column name may contain spaces, hence .data[[...]])
## threshold:  probabilities strictly above this get labeled class_name
get_class_preds <- function(pred_probs, holdout_df,
                            class_name = "Blue Tarp", threshold = 0.5) {
  prob_column <- paste0(".pred_", class_name)
  negative_label <- paste("Not", class_name)
  combined <- bind_cols(pred_probs, holdout_df)
  combined %>%
    mutate(
      pred_class = if_else(.data[[prob_column]] > threshold,
                           class_name, negative_label)
    )
}

Logistic

#### Logistic regression baseline: upsample the minority class, 5-fold CV,
#### then fit on the full training data and evaluate on the holdout set.
log_rec <- recipe(Class ~ Red + Green + Blue, data = train_data) %>%
  step_upsample(Class)
#### Model specification (plain GLM logistic regression)
log_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

log_wf <- workflow() %>%
  add_model(log_spec) %>%
  add_recipe(log_rec)

## Cross-validated fit with out-of-fold predictions retained
log_fit_cv <- log_wf %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = metric_set(accuracy, roc_auc),
    control = control_resamples(save_pred = TRUE)
  )
log_metrics <- collect_metrics(log_fit_cv) %>%
  mutate(model = "Logistic Regression")

log_metrics %>% select(.metric, mean)
## Final fit on the full training data
log_final_fit <- fit(log_wf, data = train_data)
## Predict probabilities and hard classes on the holdout set
## (default 0.5 threshold — no threshold tuning for this baseline)
log_probs <- predict(log_final_fit, small_holdout, type = "prob")
log_preds <- get_class_preds(log_probs, small_holdout, class_name = "Blue Tarp", 
                             threshold = 0.5) %>%
  mutate(
    Class = factor(Class),
    pred_class = factor(pred_class, levels = levels(Class))
  )
log_metrics_holdout <- log_preds %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "Logistic Regression")

log_roc_auc_holdout <- log_probs %>%
  bind_cols(Class = small_holdout$Class) %>%
  metrics_prob(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "Logistic Regression")


## Combine threshold-based metrics and ROC AUC into one wide row
log_metrics_combined <- log_metrics_holdout %>%
  select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  bind_cols(log_roc_auc_holdout %>% select(roc_auc = .estimate)) %>%
  mutate(Model = "Logistic Regression") %>%
  select(Model, accuracy, precision, recall, f_meas, roc_auc)

log_cm <- conf_mat(log_preds, truth = Class, estimate = pred_class)

log_cm %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix: Logistic Regression on Holdout Set") +
  theme_minimal()

## Extract the confusion matrix counts as a data frame
log_cm_table <- as_tibble(log_cm$table)
log_cm_table <- log_cm_table %>%
  rename(
    Truth = Truth,
    Prediction = Prediction,
    Count = n
  )
## Reshape into a matrix-style view (rows = truth, cols = prediction)
log_cm_matrix <- log_cm_table %>%
  pivot_wider(names_from = Prediction, values_from = Count) %>%
  rename(`Actual Class` = Truth)

## Tabular performance summary
kable(log_metrics_combined, digits = 3, 
      caption = "Logistic Regression Performance on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = 
                              c("striped", "hover", "condensed"))
Logistic Regression Performance on Holdout Set
Model accuracy precision recall f_meas roc_auc
Logistic Regression 0.914 0.077 1 0.144 1
kable(log_cm_matrix, digits = 0, 
      caption = 
        "Confusion Matrix (Matrix View): Logistic Regression on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = 
                              c("striped", "hover", "condensed"))
Confusion Matrix (Matrix View): Logistic Regression on Holdout Set
Actual Class Blue Tarp Not Blue Tarp
Blue Tarp 14480 0
Not Blue Tarp 172760 1816937

QDA

#### QDA: same upsampling recipe and CV scheme as logistic regression,
#### plus F-measure-based threshold tuning on the training data.
qda_spec <- discrim_quad() %>%
  set_engine("MASS") %>%
  set_mode("classification")

#### Recipe with upsampling of the minority class
qda_rec <- recipe(Class ~ Red + Green + Blue, data = train_data) %>%
  step_upsample(Class)


## Workflow: QDA model + upsampling recipe
qda_wf <- workflow() %>%
  add_model(qda_spec) %>%
  add_recipe(qda_rec)


## Cross-validated fit
qda_fit_cv <- qda_wf %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = metric_set(accuracy, roc_auc),
    control = control_resamples(save_pred = TRUE)
  )

## Collect CV metrics
qda_metrics_cv <- collect_metrics(qda_fit_cv) %>%
  mutate(Model = "QDA")

## Show mean CV metrics
qda_metrics_cv %>%
  dplyr::select(.metric, mean)
## Final fit on the full training data
qda_final_fit <- fit(qda_wf, data = train_data)

## Threshold tuning on TRAINING data (avoids using the holdout here)
qda_probs_train <- predict(qda_final_fit, 
                           new_data = train_data, type = "prob") %>%
  bind_cols(Class = train_data$Class)

## Sweep thresholds 0.05..0.95, scoring F-measure with "Blue Tarp"
## (first factor level) as the event
qda_threshold_perf <- probably::threshold_perf(
  qda_probs_train,
  truth = Class,
  estimate = `.pred_Blue Tarp`,
  thresholds = seq(0.05, 0.95, by = 0.01),
  metrics = metric_set(f_meas),
  event_level = "first"
)

## Pick the threshold with the highest F-measure
qda_best_threshold <- qda_threshold_perf %>%
  arrange(desc(.estimate)) %>%
  slice(1) %>%
  pull(.threshold)


## Predict on holdout set with the tuned threshold
qda_probs_holdout <- predict(qda_final_fit, small_holdout, type = "prob")
qda_preds_holdout <- get_class_preds(qda_probs_holdout, small_holdout, class_name = "Blue Tarp", threshold = qda_best_threshold) %>%
  mutate(
    Class = factor(Class),
    pred_class = factor(pred_class, levels = levels(Class))
  )

## Holdout classification metrics
qda_metrics_holdout <- qda_preds_holdout %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "QDA")

qda_roc_auc_holdout <- qda_probs_holdout %>%
  bind_cols(Class = small_holdout$Class) %>%
  metrics_prob(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "QDA")

## Combine holdout metrics and ROC AUC into one wide row
qda_metrics_combined <- qda_metrics_holdout %>%
  dplyr::select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  bind_cols(qda_roc_auc_holdout %>% 
              dplyr::select(roc_auc = .estimate)) %>%
  mutate(Model = "QDA") %>%
  dplyr::select(Model, accuracy, precision, recall, f_meas, roc_auc)

## Confusion matrix on holdout
qda_cm <- conf_mat(qda_preds_holdout, truth = Class, estimate = pred_class)

## Heatmap view
qda_cm %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix: QDA on Holdout Set") +
  theme_minimal()

## Matrix-style table (rows = truth, cols = prediction)
qda_cm_table <- as_tibble(qda_cm$table) %>%
  rename(Truth = Truth, Prediction = Prediction, Count = n) %>%
  pivot_wider(names_from = Prediction, values_from = Count) %>%
  rename(`Actual Class` = Truth)

## Display tables (metrics and confusion matrix)

kable(qda_metrics_combined, digits = 3, caption = "QDA Performance on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
QDA Performance on Holdout Set
Model accuracy precision recall f_meas roc_auc
QDA 0.995 0.626 0.758 0.686 0.991
kable(qda_cm_table, digits = 0, caption = "Confusion Matrix (Matrix View): QDA on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Confusion Matrix (Matrix View): QDA on Holdout Set
Actual Class Blue Tarp Not Blue Tarp
Blue Tarp 10974 3506
Not Blue Tarp 6550 1983147
## Ensure training and holdout probability tibbles include the truth column
qda_probs_train <- predict(qda_final_fit, new_data = train_data, type = "prob") %>%
  bind_cols(Class = train_data$Class)

qda_probs_holdout <- predict(qda_final_fit, new_data = small_holdout, type = "prob") %>%
  bind_cols(Class = small_holdout$Class)

## ROC curve coordinates for train and holdout
qda_roc_train <- roc_curve(qda_probs_train, truth = Class, `.pred_Blue Tarp`)
qda_roc_test <- roc_curve(qda_probs_holdout, truth = Class, `.pred_Blue Tarp`)

## Plot ROC curves
qda_roc_train_plot <- autoplot(qda_roc_train) +
  labs(title = "QDA ROC - Training Set") +
  theme_minimal()

qda_roc_test_plot <- autoplot(qda_roc_test) +
  labs(title = "QDA ROC - Holdout Set") +
  theme_minimal()

## Display plots side by side (patchwork's | operator)
qda_roc_train_plot | qda_roc_test_plot

## F-measure across the swept thresholds, best threshold highlighted in red
qda_fmeasure_plot <- ggplot(qda_threshold_perf, aes(x = .threshold, y = .estimate)) +
  geom_line(color = "steelblue") +
  geom_point(data = filter(qda_threshold_perf, .threshold == qda_best_threshold), color = "red", size = 3) +
  labs(title = "QDA - F-measure vs Threshold", x = "Threshold", y = "F-measure") +
  theme_minimal()
print(qda_best_threshold)
## [1] 0.9
qda_fmeasure_plot

## KNN: normalize predictors (distance-based model), upsample minority class
knn_rec <- recipe(Class ~ Red + Green + Blue, data = train_data) %>%
  step_normalize(all_numeric_predictors())%>%
  step_upsample(Class)
## Workflow with the number of neighbors left as a tuning parameter
knn_wf <- workflow() %>%
  add_model(
    nearest_neighbor(neighbors = tune()) %>%
      set_engine("kknn") %>%
      set_mode("classification")
  ) %>%
  add_recipe(knn_rec)


## Define search space for neighbors (k in [1, 100])
knn_params <- parameters(neighbors(range = c(1, 100)))


## Tune k with Bayesian optimization: up to 25 iterations, stopping
## early after 5 iterations with no ROC AUC improvement
knn_tuned <- tune_bayes(
  knn_wf,
  resamples = cv_folds,
  param_info = knn_params,
  metrics = metric_set(roc_auc),
  iter = 25,
  control = control_bayes(no_improve = 5, verbose = TRUE)
)

## Plot tuning results
autoplot(knn_tuned)

## Select best model by ROC AUC
best_knn <- select_best(knn_tuned, metric = "roc_auc")

## Finalize workflow with best number of neighbors
final_knn_wf <- finalize_workflow(knn_wf, best_knn)

## Refit on full training data
knn_final_fit <- fit(final_knn_wf, data = train_data)
## Predict probabilities on the holdout
knn_probs <- predict(knn_final_fit, small_holdout, type = "prob")
thresholds <- tibble(threshold = seq(0.01, 0.99, by = 0.01))

## NOTE(review): unlike QDA/elastic net, this threshold sweep is scored
## on the HOLDOUT set itself — that leaks holdout information into the
## threshold choice; consider tuning on training data instead. Verify.
threshold_eval <- thresholds %>%
  mutate(
    metrics = map(threshold, ~{
      get_class_preds(knn_probs, small_holdout, class_name = "Blue Tarp", threshold = .x) %>%
        mutate(
          Class = factor(small_holdout$Class, levels = c("Blue Tarp", "Not Blue Tarp")),
          pred_class = factor(pred_class, levels = c("Blue Tarp", "Not Blue Tarp"))
        ) %>%
        metrics_class(truth = Class, estimate = pred_class)
    })
  ) %>%
  unnest(metrics)
## Threshold that maximizes F-measure
best_threshold <- threshold_eval %>%
  filter(.metric == "f_meas") %>%
  arrange(desc(.estimate)) %>%
  slice(1) %>%
  pull(threshold)

## Hard predictions at the selected threshold
knn_preds <- get_class_preds(knn_probs, small_holdout, class_name = "Blue Tarp", threshold = best_threshold) %>%
  mutate(
    Class = factor(Class, levels = c("Blue Tarp", "Not Blue Tarp")),
    pred_class = factor(pred_class, levels = levels(Class))
  )

## Confusion Matrix
knn_cm <- conf_mat(knn_preds, truth = Class, estimate = pred_class)

## Threshold-based holdout metrics
knn_metrics_holdout <- knn_preds %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "KNN")

knn_roc_auc_holdout <- knn_probs %>%
  bind_cols(Class = small_holdout$Class) %>%
  metrics_prob(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "KNN")
print(best_threshold)
## [1] 0.98
## F-measure vs threshold plot, best threshold highlighted in red
ggplot(threshold_eval %>% filter(.metric == "f_meas"), 
       aes(x = threshold, y = .estimate)) +
  geom_line(color = "steelblue") +
  geom_point(data = filter(threshold_eval, .metric == "f_meas", threshold == best_threshold), 
             color = "red", size = 3) +
  labs(title = "KNN - F-measure vs Threshold", 
       x = "Threshold", y = "F-measure") +
  theme_minimal()

## Combine threshold-based metrics and ROC AUC into one wide row
knn_metrics_combined <- knn_metrics_holdout %>%
  dplyr::select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  bind_cols(knn_roc_auc_holdout %>% dplyr::select(roc_auc = .estimate)) %>%
  mutate(Model = "KNN") %>%
  dplyr::select(Model, accuracy, precision, recall, f_meas, roc_auc)
## Confusion matrix heatmap
knn_cm %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix: KNN on Holdout Set") +
  theme_minimal()

## Matrix-style table (rows = truth, cols = prediction)
knn_cm_table <- as_tibble(knn_cm$table) %>%
  rename(Truth = Truth, Prediction = Prediction, Count = n) %>%
  pivot_wider(names_from = Prediction, values_from = Count) %>%
  rename(`Actual Class` = Truth)

## Display performance table
kable(knn_metrics_combined, digits = 3, caption = "KNN Performance on Holdout Set (Tuned)") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
KNN Performance on Holdout Set (Tuned)
Model accuracy precision recall f_meas roc_auc
KNN 0.983 0.274 0.782 0.406 0.945
kable(knn_cm_table, digits = 0, caption = "Confusion Matrix (Matrix View): KNN on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Confusion Matrix (Matrix View): KNN on Holdout Set
Actual Class Blue Tarp Not Blue Tarp
Blue Tarp 11322 3158
Not Blue Tarp 29982 1959715

Elastic Net

#### Elastic net logistic regression: tune penalty (regularization amount)
#### and mixture (0 = ridge, 1 = lasso) over a regular grid.
elastic_spec <- logistic_reg(
  penalty = tune(),    ## regularization amount, tuned
  mixture = tune()     ## ridge/lasso blend, tuned
) %>%
  set_engine("glmnet") %>%
  set_mode("classification")

## Upsampling recipe (same as the other models)
elastic_rec <- recipe(Class ~ Red + Green + Blue, data = train_data) %>%
  step_upsample(Class)

## Workflow
elastic_wf <- workflow() %>%
  add_model(elastic_spec) %>%
  add_recipe(elastic_rec)

## 5x5 regular grid over mixture and penalty
elastic_grid <- grid_regular(
  mixture(range = c(0, 1)),
  penalty(range = c(-4, 0)),  ## log10 scale: 10^-4 to 1
  levels = 5
)

## Grid search over the CV folds

elastic_tuned <- tune_grid(
  elastic_wf,
  resamples = cv_folds,
  grid = elastic_grid,
  metrics = metric_set(roc_auc),
  control = control_grid(save_pred = TRUE)
)

## Best penalty/mixture combination by ROC AUC
best_params <- select_best(elastic_tuned, metric = "roc_auc")

## Finalize the workflow and refit on the full training data
final_elastic_wf <- finalize_workflow(elastic_wf, best_params)
elastic_final_fit <- fit(final_elastic_wf, data = train_data)

## Predict probabilities on training data for threshold tuning
elastic_probs_train <- predict(elastic_final_fit, new_data = train_data, type = "prob") %>%
  bind_cols(Class = train_data$Class)

## Threshold tuning based on F-measure (same procedure as QDA)
elastic_threshold_perf <- probably::threshold_perf(
  elastic_probs_train,
  truth = Class,
  estimate = `.pred_Blue Tarp`,
  thresholds = seq(0.05, 0.95, by = 0.01),
  metrics = metric_set(f_meas),
  event_level = "first"
)

## Threshold maximizing F-measure on the training data
elastic_best_threshold <- elastic_threshold_perf %>%
  arrange(desc(.estimate)) %>%
  slice(1) %>%
  pull(.threshold)

## Predict on holdout with the tuned threshold
elastic_probs_holdout <- predict(elastic_final_fit, small_holdout, type = "prob")
elastic_preds_holdout <- get_class_preds(elastic_probs_holdout, small_holdout,
                                        class_name = "Blue Tarp",
                                        threshold = elastic_best_threshold) %>%
  mutate(
    Class = factor(Class),
    pred_class = factor(pred_class, levels = levels(Class))
  )

## Holdout classification metrics
elastic_metrics_holdout <- elastic_preds_holdout %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "Elastic Net")
## ROC AUC on holdout
elastic_roc_auc_holdout <- elastic_probs_holdout %>%
  bind_cols(Class = small_holdout$Class) %>%
  metrics_prob(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "Elastic Net")

## Combine holdout metrics and ROC AUC into one wide row
elastic_metrics_combined <- elastic_metrics_holdout %>%
  dplyr::select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  bind_cols(elastic_roc_auc_holdout %>% dplyr::select(roc_auc = .estimate)) %>%
  mutate(Model = "Elastic Net") %>%
  dplyr::select(Model, accuracy, precision, recall, f_meas, roc_auc)

## Confusion matrix on holdout
elastic_cm <- conf_mat(elastic_preds_holdout, truth = Class, estimate = pred_class)

## Plot confusion matrix heatmap
elastic_cm %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix: Elastic Net on Holdout Set") +
  theme_minimal()

## Matrix-style table (rows = truth, cols = prediction)
elastic_cm_table <- as_tibble(elastic_cm$table) %>%
  rename(Truth = Truth, Prediction = Prediction, Count = n) %>%
  pivot_wider(names_from = Prediction, values_from = Count) %>%
  rename(`Actual Class` = Truth)

## Display tables (metrics and confusion matrix)
kable(elastic_metrics_combined, digits = 3, caption = "Elastic Net Performance on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Elastic Net Performance on Holdout Set
Model accuracy precision recall f_meas roc_auc
Elastic Net 0.958 0.146 1 0.254 1
kable(elastic_cm_table, digits = 0, caption = "Confusion Matrix (Matrix View): Elastic Net on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Confusion Matrix (Matrix View): Elastic Net on Holdout Set
Actual Class Blue Tarp Not Blue Tarp
Blue Tarp 14474 6
Not Blue Tarp 84874 1904823
## Plot ROC curves (Train vs Holdout)

## Training ROC curve
elastic_roc_train <- roc_curve(elastic_probs_train, truth = Class, `.pred_Blue Tarp`)

## Holdout predictions with the true labels bound on.
## NOTE(review): this re-assigns elastic_probs_holdout with an extra
## Class column; downstream code at the "Metrics" section selects columns
## explicitly, so this is safe, but the reassignment is easy to miss.
elastic_probs_holdout <- predict(elastic_final_fit, new_data = small_holdout, type = "prob") %>%
  bind_cols(Class = small_holdout$Class)

## Holdout ROC curve
elastic_roc_holdout <- roc_curve(elastic_probs_holdout, truth = Class, `.pred_Blue Tarp`)

## Plot ROC curves
elastic_roc_train_plot <- autoplot(elastic_roc_train) +
  labs(title = "Elastic Net ROC - Training Set") +
  theme_minimal()

elastic_roc_test_plot <- autoplot(elastic_roc_holdout) +
  labs(title = "Elastic Net ROC - Holdout Set") +
  theme_minimal()

## Display plots side by side (patchwork's | operator)
elastic_roc_train_plot | elastic_roc_test_plot

print(elastic_best_threshold)
## [1] 0.76
## Plot F-measure vs Threshold with the optimal point highlighted in red
ggplot(elastic_threshold_perf, aes(x = .threshold, y = .estimate)) +
  geom_line(color = "steelblue") +
  geom_point(data = filter(elastic_threshold_perf, .threshold == elastic_best_threshold), color = "red", size = 3) +
  labs(title = "Elastic Net - F-measure vs Threshold", x = "Threshold", y = "F-measure") +
  theme_minimal()

Random Forest

## Random Forest: tune mtry and min_n over a regular grid; 500 trees fixed
rf_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("classification")

## Normalization is not required for trees but is harmless here;
## upsample the minority class as in the other models
rf_rec <- recipe(Class ~ Red + Green + Blue, data = train_data) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_upsample(Class)

rf_wf <- workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(rf_rec)

## Tuning grid: mtry bounded by the 3 available predictors
rf_grid <- grid_regular(
  mtry(range = c(1, 3)),  # 3 predictors max (Red, Green, Blue)
  min_n(range = c(2, 40)),
  levels = 4
)

## Grid search over the CV folds
rf_tuned <- tune_grid(
  rf_wf,
  resamples = cv_folds,
  grid = rf_grid,
  metrics = metric_set(roc_auc),
  control = control_grid(save_pred = TRUE)
)

## Plot tuning results
autoplot(rf_tuned)

## Finalize the best model by ROC AUC
best_rf <- select_best(rf_tuned, metric = "roc_auc")
final_rf_wf <- finalize_workflow(rf_wf, best_rf)

rf_final_fit <- fit(final_rf_wf, data = train_data)
## Predict probabilities on training data for threshold tuning
rf_probs_train <- predict(rf_final_fit, new_data = train_data, type = "prob") %>%
  bind_cols(Class = train_data$Class)

## F-measure-based threshold sweep on training data (same as QDA)
rf_threshold_perf <- probably::threshold_perf(
  rf_probs_train,
  truth = Class,
  estimate = `.pred_Blue Tarp`,
  thresholds = seq(0.05, 0.95, by = 0.01),
  metrics = metric_set(f_meas),
  event_level = "first"
)

rf_best_threshold <- rf_threshold_perf %>%
  arrange(desc(.estimate)) %>%
  slice(1) %>%
  pull(.threshold)

print(rf_best_threshold)
## [1] 0.83
## F-measure vs threshold plot, best threshold highlighted in red
ggplot(rf_threshold_perf, aes(x = .threshold, y = .estimate)) +
  geom_line(color = "steelblue") +
  geom_point(data = filter(rf_threshold_perf, .threshold == rf_best_threshold), color = "red", size = 3) +
  labs(title = "Random Forest - F-measure vs Threshold", x = "Threshold", y = "F-measure") +
  theme_minimal()

## Predict on holdout set using the tuned threshold
rf_probs <- predict(rf_final_fit, small_holdout, type = "prob")

rf_preds <- get_class_preds(rf_probs, small_holdout, class_name = "Blue Tarp", threshold = rf_best_threshold) %>%
  mutate(
    Class = factor(Class, levels = c("Blue Tarp", "Not Blue Tarp")),
    pred_class = factor(pred_class, levels = levels(Class))
  )

## Confusion Matrix + ROC AUC + threshold-based metrics
rf_cm <- conf_mat(rf_preds, truth = Class, estimate = pred_class)

rf_metrics_holdout <- rf_preds %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "Random Forest")

rf_roc_auc_holdout <- rf_probs %>%
  bind_cols(Class = small_holdout$Class) %>%
  metrics_prob(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "Random Forest")
## Combine into one wide metrics row
rf_metrics_combined <- rf_metrics_holdout %>%
  dplyr::select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  bind_cols(rf_roc_auc_holdout %>% dplyr::select(roc_auc = .estimate)) %>%
  mutate(Model = "Random Forest") %>%
  dplyr::select(Model, accuracy, precision, recall, f_meas, roc_auc)

## Matrix-style confusion matrix table (rows = truth, cols = prediction)
rf_cm_table <- as_tibble(rf_cm$table) %>%
  rename(Truth = Truth, Prediction = Prediction, Count = n) %>%
  pivot_wider(names_from = Prediction, values_from = Count) %>%
  rename(`Actual Class` = Truth)

## Display metrics + confusion matrix
kable(rf_metrics_combined, digits = 3, caption = "Random Forest Performance on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Random Forest Performance on Holdout Set
Model accuracy precision recall f_meas roc_auc
Random Forest 0.994 0.616 0.511 0.559 0.978
kable(rf_cm_table, digits = 0, caption = "Confusion Matrix (Matrix View): RF on Holdout Set") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Confusion Matrix (Matrix View): RF on Holdout Set
Actual Class Blue Tarp Not Blue Tarp
Blue Tarp 7396 7084
Not Blue Tarp 4605 1985092
## Heatmap-style confusion matrix
rf_cm %>%
  autoplot(type = "heatmap") +
  labs(title = "Confusion Matrix: Random Forest on Holdout Set") +
  theme_minimal()

## ROC Curves (Train vs Holdout) with true labels bound on
rf_probs_train <- predict(rf_final_fit, new_data = train_data, type = "prob") %>%
  bind_cols(Class = train_data$Class)

rf_probs_holdout <- predict(rf_final_fit, new_data = small_holdout, type = "prob") %>%
  bind_cols(Class = small_holdout$Class)

rf_roc_train <- roc_curve(rf_probs_train, truth = Class, `.pred_Blue Tarp`)
rf_roc_test <- roc_curve(rf_probs_holdout, truth = Class, `.pred_Blue Tarp`)

rf_roc_train_plot <- autoplot(rf_roc_train) +
  labs(title = "Random Forest ROC - Training Set") +
  theme_minimal()

rf_roc_test_plot <- autoplot(rf_roc_test) +
  labs(title = "Random Forest ROC - Holdout Set") +
  theme_minimal()

## Display plots side by side (patchwork's | operator)
rf_roc_train_plot | rf_roc_test_plot

Metrics

## Holdout ROC AUC for each model, one row per model
log_roc_auc <- log_probs %>%
  bind_cols(Class = small_holdout$Class) %>%
  roc_auc(truth = Class, `.pred_Blue Tarp`) %>%  
  mutate(Model = "Logistic Regression")


## qda_probs_holdout / elastic_probs_holdout already carry a Class column,
## so mutate() overwrites it and select() narrows to the needed columns
qda_roc_auc <- qda_probs_holdout %>%
  mutate(Class = small_holdout$Class) %>%
  dplyr::select(`.pred_Blue Tarp`, `.pred_Not Blue Tarp`, Class) %>%
  roc_auc(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "QDA")

elastic_roc_auc <- elastic_probs_holdout %>%
  mutate(Class = small_holdout$Class) %>%       
  dplyr::select(`.pred_Blue Tarp`, `.pred_Not Blue Tarp`, Class) %>%  
  roc_auc(truth = Class, `.pred_Blue Tarp`) %>%  ## compute roc_auc
  mutate(Model = "Elastic Net")


rf_roc_auc <- rf_probs %>%
  bind_cols(Class = small_holdout$Class) %>%
  roc_auc(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "Random Forest")

knn_roc_auc <- knn_probs %>%
  bind_cols(Class = small_holdout$Class) %>%
  roc_auc(truth = Class, `.pred_Blue Tarp`) %>%
  mutate(Model = "KNN")

all_roc_auc <- bind_rows(
  log_roc_auc,
  qda_roc_auc, 
  knn_roc_auc, 
  elastic_roc_auc, 
  rf_roc_auc)

## Classification metrics (accuracy, recall, etc.)
## NOTE(review): this reassigns log_metrics, which earlier held the
## logistic CV metrics — the CV results are clobbered from here on.
log_metrics <- log_preds %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "Logistic Regression")

qda_metrics <- qda_preds_holdout %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "QDA")

elastic_metrics <- elastic_preds_holdout %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "Elastic Net")

rf_metrics <- rf_preds %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "Random Forest")

knn_metrics <- knn_preds %>%
  metrics_class(truth = Class, estimate = pred_class) %>%
  mutate(Model = "KNN")

all_metrics <- bind_rows(
  log_metrics,
  qda_metrics,
  knn_metrics,
  elastic_metrics,
  rf_metrics
)
## Pivot to wide format (one row per model)
metrics_table <- all_metrics %>%
  dplyr::select(Model, .metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate)

## Join ROC AUC onto the threshold-based metrics
metrics_table <- metrics_table %>%
  left_join(all_roc_auc %>% dplyr::select(Model, .estimate) %>% rename(roc_auc = .estimate), by = "Model")

## Display final comparison table
kable(metrics_table, digits = 2, caption = "Model Performance Metrics on Holdout Set (Including ROC AUC)")
Model Performance Metrics on Holdout Set (Including ROC AUC)
Model accuracy recall precision f_meas roc_auc
Logistic Regression 0.91 1.00 0.08 0.14 1.00
QDA 0.99 0.76 0.63 0.69 0.99
KNN 0.98 0.78 0.27 0.41 0.95
Elastic Net 0.96 1.00 0.15 0.25 1.00
Random Forest 0.99 0.51 0.62 0.56 0.98

Plotting the holdout ROC curves for all models together.

## Compute ROC-curve coordinates for one model's holdout probabilities.
## pred_probs: tibble of .pred_* columns from predict(type = "prob")
## model_name: label written into the Model column for plotting
## truth_data: data frame supplying the true Class column; defaults to
##             small_holdout for backward compatibility (previously this
##             was a hard-coded global, making the helper untestable
##             against any other dataset)
get_roc_df <- function(pred_probs, model_name, truth_data = small_holdout) {
  pred_probs %>%
    ## Drop any stale Class-like columns already bound to the tibble
    dplyr::select(-starts_with("Class")) %>%
    ## Attach the true labels explicitly
    mutate(Class = truth_data$Class) %>%
    roc_curve(truth = Class, `.pred_Blue Tarp`) %>%
    mutate(Model = model_name)
}

## ROC-curve coordinates for all five models on the holdout data
log_roc     <- get_roc_df(log_probs, "Logistic Regression")
qda_roc     <- get_roc_df(qda_probs_holdout , "QDA")
knn_roc     <- get_roc_df(knn_probs, "KNN")
elastic_roc <- get_roc_df(elastic_probs_holdout, "Elastic Net") 
rf_roc      <- get_roc_df(rf_probs, "Random Forest")

## Stack into one long data frame for a single overlaid plot
all_roc <- bind_rows(
  log_roc, 
  qda_roc,
  knn_roc, 
  elastic_roc, 
  rf_roc
  )

## Overlaid holdout ROC curves for all five models
Plotss<-ggplot(all_roc, aes(x = 1 - specificity, y = sensitivity, color = Model)) +
  ## linewidth replaces the deprecated `size` aesthetic for lines
  ## (ggplot2 >= 3.4); visual output is identical
  geom_line(linewidth = 1) +
  ## Diagonal = no-skill classifier reference
  geom_abline(linetype = "dashed", color = "gray") +
  coord_equal() +
  theme_minimal(base_size = 14) +
  labs(
    title = "ROC Curves for All Models (Holdout Data)",
    x = "1 - Specificity (False Positive Rate)",
    y = "Sensitivity (True Positive Rate)",
    color = "Model"
  ) +
  theme(legend.position = "bottom") +
  scale_color_brewer(palette = "Dark2")
Plotss