2 Demo: a predictive modelling case study (base R approach)
Let’s use base R and selected packages to build a predictive model for BMI using our diabetes
data set.
2.1 Load Data and Perform EDA
library(tidyverse)
library(ggcorrplot)
library(glmnet)
library(fastDummies)
# Load the data
input_diabetes <- read_csv("data/data-diabetes.csv")
# Create BMI variable
data_diabetes <- input_diabetes %>%
  mutate(BMI = round(weight / height^2 * 703, 2)) %>%
  relocate(BMI, .after = id)
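The factor 703 converts the imperial units in this data set (weight in pounds, height in inches) to the usual BMI scale. As a quick arithmetic check against the first row (121 lb, 62 in):
# check: 703 * weight / height^2 for the first observation
703 * 121 / 62^2 # ~22.13, matching BMI in row 1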
# preview data
glimpse(data_diabetes)
## Rows: 403
## Columns: 20
## $ id <dbl> 1000, 1001, 1002, 1003, 1005, 1008, 1011, 1015, 1016, 1022, 1…
## $ BMI <dbl> 22.13, 37.42, 48.37, 18.64, 27.82, 26.50, 28.20, 34.33, 24.51…
## $ chol <dbl> 203, 165, 228, 78, 249, 248, 195, 227, 177, 263, 242, 215, 23…
## $ stab.glu <dbl> 82, 97, 92, 93, 90, 94, 92, 75, 87, 89, 82, 128, 75, 79, 76, …
## $ hdl <dbl> 56, 24, 37, 12, 28, 69, 41, 44, 49, 40, 54, 34, 36, 46, 30, 4…
## $ ratio <dbl> 3.6, 6.9, 6.2, 6.5, 8.9, 3.6, 4.8, 5.2, 3.6, 6.6, 4.5, 6.3, 6…
## $ glyhb <dbl> 4.31, 4.44, 4.64, 4.63, 7.72, 4.81, 4.84, 3.94, 4.84, 5.78, 4…
## $ location <chr> "Buckingham", "Buckingham", "Buckingham", "Buckingham", "Buck…
## $ age <dbl> 46, 29, 58, 67, 64, 34, 30, 37, 45, 55, 60, 38, 27, 40, 36, 3…
## $ gender <chr> "female", "female", "female", "male", "male", "male", "male",…
## $ height <dbl> 62, 64, 61, 67, 68, 71, 69, 59, 69, 63, 65, 58, 60, 59, 69, 6…
## $ weight <dbl> 121, 218, 256, 119, 183, 190, 191, 170, 166, 202, 156, 195, 1…
## $ frame <chr> "medium", "large", "large", "large", "medium", "large", "medi…
## $ bp.1s <dbl> 118, 112, 190, 110, 138, 132, 161, NA, 160, 108, 130, 102, 13…
## $ bp.1d <dbl> 59, 68, 92, 50, 80, 86, 112, NA, 80, 72, 90, 68, 80, NA, 66, …
## $ bp.2s <dbl> NA, NA, 185, NA, NA, NA, 161, NA, 128, NA, 130, NA, NA, NA, N…
## $ bp.2d <dbl> NA, NA, 92, NA, NA, NA, 112, NA, 86, NA, 90, NA, NA, NA, NA, …
## $ waist <dbl> 29, 46, 49, 33, 44, 36, 46, 34, 34, 45, 39, 42, 35, 37, 36, 3…
## $ hip <dbl> 38, 48, 57, 38, 41, 42, 49, 39, 40, 50, 45, 50, 41, 43, 40, 4…
## $ time.ppn <dbl> 720, 360, 180, 480, 300, 195, 720, 1020, 300, 240, 300, 90, 7…
# run basic EDA
# note: descriptive statistics and plots were covered in the EDA session,
# so here we only look at missing data and correlation
# count missing values per variable
data_na <- data_diabetes %>%
  summarise(across(everything(), ~ sum(is.na(.))))
# make a table with counts sorted from highest to lowest
data_na_long <- data_na %>%
  pivot_longer(-id, names_to = "variable", values_to = "count") %>%
  arrange(desc(count))
# make a column plot to visualize the counts
data_na_long %>%
  ggplot(aes(x = variable, y = count)) +
  geom_col(fill = "blue4") +
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
# Based on the number of missing values, let's delete bp.2s and bp.2d
# and use a complete-case analysis
data_diabetes <- data_diabetes %>%
  dplyr::select(-bp.2s, -bp.2d) %>%
  na.omit()
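It is worth confirming how many observations survive the complete-case filter:
# rows remaining after dropping incomplete cases
nrow(data_diabetes)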
# Correlation heatmap
data_cor <- data_diabetes %>%
  select(where(is.numeric), -id) %>%
  cor()
ggcorrplot(data_cor, hc.order = TRUE, lab = FALSE)
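As a complement to the heatmap, one can also list the most strongly correlated variable pairs numerically; a minimal sketch reusing data_cor:
# rank variable pairs by absolute correlation (each pair listed once)
cor_long <- as.data.frame(as.table(data_cor)) %>%
  filter(as.character(Var1) < as.character(Var2)) %>%
  arrange(desc(abs(Freq)))
head(cor_long)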
2.2 Split Data
# split data into train (80%) and test (20%) sets
set.seed(123)
n <- nrow(data_diabetes)
test_index <- sample(seq_len(n), size = 0.2 * n)
data_test <- data_diabetes[test_index, ]
data_train <- data_diabetes[-test_index, ]
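A quick sanity check that the split has roughly 80/20 proportions:
# expect roughly a 4:1 ratio of training to test rows
dim(data_train)
dim(data_test)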
2.3 Feature Engineering and Scaling
# Conversion factors
inch2m <- 2.54 / 100
pound2kg <- 0.45
# ---- Process Training Data ----
data_train_processed <- data_train %>%
  mutate(
    height = round(height * inch2m, 2),
    weight = round(weight * pound2kg, 2),
    glu = log(stab.glu)
  ) %>%
  select(-stab.glu, -id)
# Remove zero-variance features
nzv <- sapply(data_train_processed, function(x) length(unique(x)) > 1)
data_train_processed <- data_train_processed[, nzv]
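Note that this filter only removes strictly constant columns; a stricter near-zero-variance screen is available in the caret package, sketched here under the assumption that caret is installed:
# alternative: flag near-zero-variance predictors with caret
library(caret)
nzv_idx <- nearZeroVar(data_train_processed)
colnames(data_train_processed)[nzv_idx] # columns one might drop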
# Remove highly correlated predictors (|r| > 0.8)
cor_matrix <- cor(select(data_train_processed, where(is.numeric)))
high_corr <- names(which(apply(cor_matrix, 2, function(x) any(abs(x) > 0.8 & abs(x) < 1))))

# note: high_corr flags both members of each correlated pair, so rather than
# dropping all of them with select(-all_of(high_corr)) we choose the
# redundant predictors by hand
data_train_processed <- data_train_processed %>% select(-c("weight", "waist"))
# Dummy encode categorical variables
data_train_processed <- dummy_cols(data_train_processed,
                                   select_columns = c("location", "gender", "frame"),
                                   remove_selected_columns = TRUE)
# Separate outcome and predictors
y_train <- data_train_processed$BMI
x_train <- data_train_processed %>% select(-BMI)
# Scale predictors
x_train_scaled <- scale(x_train)
train_means <- attr(x_train_scaled, "scaled:center")
train_sds <- attr(x_train_scaled, "scaled:scale")
2.4 Prepare Test Data with Same Processing
# ---- Process Test Data ----
data_test_processed <- data_test %>%
  mutate(
    height = round(height * inch2m, 2),
    weight = round(weight * pound2kg, 2),
    glu = log(stab.glu)
  ) %>%
  select(-stab.glu, -id)
# Dummy encode categorical variables
data_test_processed <- dummy_cols(data_test_processed,
                                  select_columns = c("location", "gender", "frame"),
                                  remove_selected_columns = TRUE)
# Keep only the columns retained in training
data_test_processed <- data_test_processed %>%
  select(colnames(data_train_processed))
# Ensure same column order as training
x_test <- data_test_processed %>% select(-BMI)
x_test <- x_test[, colnames(x_train)]
# Apply training set scaling
x_test_scaled <- scale(x_test, center = train_means, scale = train_sds)
y_test <- data_test_processed$BMI
2.5 Lasso Regression with Cross-Validation
# Fit Lasso regression with 10-fold CV
set.seed(123)
# note: standardize = FALSE because the predictors are already scaled
cv_model <- cv.glmnet(x_train_scaled, y_train, alpha = 1, standardize = FALSE)
# Plot cross-validation error
plot(cv_model)
# Best lambda value
best_lambda <- cv_model$lambda.min
cat("Best lambda:", best_lambda)
## Best lambda: 0.03330154
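Besides lambda.min, cv.glmnet also stores lambda.1se, the largest lambda whose cross-validation error is within one standard error of the minimum; it yields a sparser model and is often preferred when interpretability matters. To compare:
# sparser alternative: largest lambda within 1 SE of the minimum CV error
cv_model$lambda.1se
coef(cv_model, s = "lambda.1se")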
2.6 Evaluate Model on Test Data
# Predict
pred_test <- predict(cv_model, s = best_lambda, newx = x_test_scaled)
# RMSE
rmse <- sqrt(mean((pred_test - y_test)^2))
cat("RMSE on test data:", rmse)
## RMSE on test data: 2.729465
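RMSE is expressed in BMI units; a scale-free complement is the test-set R-squared, which can be computed directly from its definition:
# R-squared on the test set: 1 - SS_res / SS_tot
r2_test <- 1 - sum((y_test - as.numeric(pred_test))^2) /
  sum((y_test - mean(y_test))^2)
r2_test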
# Correlation
cor(pred_test, y_test)
##         [,1]
## s1 0.8872908
# Scatter plot: Predicted vs Actual
plot(y_test, pred_test,
     xlab = "Actual BMI", ylab = "Predicted BMI", pch = 19,
     main = "Predicted vs Actual BMI")
abline(0, 1, col = "red", lwd = 2)
2.7 Variable Importance Plot
# Extract coefficients
coef_matrix <- coef(cv_model, s = best_lambda)
coef_df <- as.data.frame(as.matrix(coef_matrix))
coef_df$feature <- rownames(coef_df)
colnames(coef_df)[1] <- "coefficient"
# Filter out intercept and zero coefficients
coef_df <- coef_df %>%
  filter(feature != "(Intercept)", coefficient != 0) %>%
  mutate(abs_coef = abs(coefficient)) %>%
  arrange(desc(abs_coef))
# Plot
ggplot(coef_df, aes(x = reorder(feature, abs_coef), y = abs_coef)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Variable Importance (Lasso Coefficients)",
       x = "Feature", y = "Absolute Coefficient") +
  theme_minimal()
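A possible variant: since lasso coefficients keep their sign, plotting the signed values (rather than absolute ones) also shows the direction of each association. A minimal sketch reusing coef_df:
# same plot, but with signed coefficients to show direction of effect
ggplot(coef_df, aes(x = reorder(feature, coefficient), y = coefficient)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Lasso Coefficients (signed)",
       x = "Feature", y = "Coefficient (standardized scale)") +
  theme_minimal()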