2 Demo: a predictive modelling case study
Let's use the tidymodels framework to run a small predictive case study, trying to build a predictive model for BMI using our diabetes data set. We will use:

- rsample for splitting the data into test and non-test sets, as well as for creating cross-validation folds
- recipes for feature engineering, e.g. changing from imperial to metric measurements and removing irrelevant and highly correlated features
- parsnip to specify the Lasso regression model
- tune to optimize the search space for the lambda values
- yardstick to assess predictions
- workflows to put all the steps together
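To show how these pieces snap together before we go step by step, here is a minimal sketch of the overall flow; object names such as my_data and y are placeholders, not part of the demo:

# minimal sketch of the tidymodels flow used in this demo (placeholder names)
split <- initial_split(my_data, prop = 0.8)            # rsample: test/non-test split
folds <- vfold_cv(training(split), v = 5)              # rsample: cross-validation folds
rec   <- recipe(y ~ ., data = training(split))         # recipes: feature engineering
mod   <- linear_reg(penalty = tune(), mixture = 1) %>% # parsnip: model specification
  set_engine("glmnet")
wf    <- workflow() %>% add_recipe(rec) %>% add_model(mod) # workflows: bundle recipe + model
res   <- tune_grid(wf, resamples = folds)              # tune: search over lambda values
collect_metrics(res)                                   # yardstick metrics, e.g. rmse, rsq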
2.1 Data import & EDA
# load libraries
library(tidyverse)
library(tidymodels)
library(ggcorrplot)
library(reshape2)
library(vip)
# import raw data
input_diabetes <- read_csv("data/data-diabetes.csv")
# create BMI variable
conv_factor <- 703 # conversion factor to calculate BMI from inches and pounds: BMI = weight (lb) / [height (in)]^2 x 703
data_diabetes <- input_diabetes %>%
  mutate(BMI = weight / height^2 * conv_factor, BMI = round(BMI, 2)) %>% # e.g. first subject: 121 / 62^2 * 703 = 22.13
  relocate(BMI, .after = id)
# preview data
glimpse(data_diabetes)
## Rows: 403
## Columns: 20
## $ id <dbl> 1000, 1001, 1002, 1003, 1005, 1008, 1011, 1015, 1016, 1022, 1…
## $ BMI <dbl> 22.13, 37.42, 48.37, 18.64, 27.82, 26.50, 28.20, 34.33, 24.51…
## $ chol <dbl> 203, 165, 228, 78, 249, 248, 195, 227, 177, 263, 242, 215, 23…
## $ stab.glu <dbl> 82, 97, 92, 93, 90, 94, 92, 75, 87, 89, 82, 128, 75, 79, 76, …
## $ hdl <dbl> 56, 24, 37, 12, 28, 69, 41, 44, 49, 40, 54, 34, 36, 46, 30, 4…
## $ ratio <dbl> 3.6, 6.9, 6.2, 6.5, 8.9, 3.6, 4.8, 5.2, 3.6, 6.6, 4.5, 6.3, 6…
## $ glyhb <dbl> 4.31, 4.44, 4.64, 4.63, 7.72, 4.81, 4.84, 3.94, 4.84, 5.78, 4…
## $ location <chr> "Buckingham", "Buckingham", "Buckingham", "Buckingham", "Buck…
## $ age <dbl> 46, 29, 58, 67, 64, 34, 30, 37, 45, 55, 60, 38, 27, 40, 36, 3…
## $ gender <chr> "female", "female", "female", "male", "male", "male", "male",…
## $ height <dbl> 62, 64, 61, 67, 68, 71, 69, 59, 69, 63, 65, 58, 60, 59, 69, 6…
## $ weight <dbl> 121, 218, 256, 119, 183, 190, 191, 170, 166, 202, 156, 195, 1…
## $ frame <chr> "medium", "large", "large", "large", "medium", "large", "medi…
## $ bp.1s <dbl> 118, 112, 190, 110, 138, 132, 161, NA, 160, 108, 130, 102, 13…
## $ bp.1d <dbl> 59, 68, 92, 50, 80, 86, 112, NA, 80, 72, 90, 68, 80, NA, 66, …
## $ bp.2s <dbl> NA, NA, 185, NA, NA, NA, 161, NA, 128, NA, 130, NA, NA, NA, N…
## $ bp.2d <dbl> NA, NA, 92, NA, NA, NA, 112, NA, 86, NA, 90, NA, NA, NA, NA, …
## $ waist <dbl> 29, 46, 49, 33, 44, 36, 46, 34, 34, 45, 39, 42, 35, 37, 36, 3…
## $ hip <dbl> 38, 48, 57, 38, 41, 42, 49, 39, 40, 50, 45, 50, 41, 43, 40, 4…
## $ time.ppn <dbl> 720, 360, 180, 480, 300, 195, 720, 1020, 300, 240, 300, 90, 7…
# run basic EDA
# note: we have seen descriptive statistics and plots during EDA session
# note: so here we only look at missing data and correlation
# calculate number of missing data per variable
data_na <- data_diabetes %>%
  summarise(across(everything(), ~ sum(is.na(.))))
# make a table with counts sorted from highest to lowest
data_na_long <- data_na %>%
  pivot_longer(-id, names_to = "variable", values_to = "count") %>%
  arrange(desc(count))
# make a column plot to visualize the counts
data_na_long %>%
  ggplot(aes(x = variable, y = count)) +
  geom_col(fill = "blue4") +
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
# calculate correlation between numeric variables
data_cor <- data_diabetes %>%
  dplyr::select(-id) %>%
  dplyr::select(where(is.numeric)) %>%
  cor(use = "pairwise.complete.obs")
# visualize correlation via heatmap
ggcorrplot(data_cor, hc.order = TRUE, lab = FALSE)
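If you prefer to list the strongly correlated pairs instead of reading them off the heatmap, a small sketch follows; the 0.8 cutoff anticipates the step_corr threshold used in the feature engineering below, and high_cor is a throwaway name:

# list variable pairs with absolute correlation above 0.8
high_cor <- which(abs(data_cor) > 0.8 & upper.tri(data_cor), arr.ind = TRUE)
data.frame(var1 = rownames(data_cor)[high_cor[, 1]],
           var2 = colnames(data_cor)[high_cor[, 2]],
           cor  = data_cor[high_cor])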
# based on the number of missing data, let's delete bp.2s and bp.2d
# and use complete-case analysis
data_diabetes_narm <- data_diabetes %>%
  dplyr::select(-bp.2s, -bp.2d) %>%
  na.omit()
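Complete-case analysis is simple but discards rows; as an alternative not used in this demo, recipes also provides imputation steps that could be added to a data recipe, e.g. (a sketch, assuming the steps are trained on the data at hand):

# hypothetical alternative: impute missing values instead of dropping rows
recipe(BMI ~ ., data = data_diabetes) %>%
  step_impute_median(all_numeric_predictors()) %>% # fill numeric NAs with the training median
  step_impute_mode(all_nominal_predictors())       # fill categorical NAs with the most common level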
2.2 Data splitting
# use the tidymodels framework to fit a Lasso regression model for predicting BMI,
# using repeated cross-validation to tune the lambda value in the L1 penalty term

# select random seed value
myseed <- 123

# split data into non-test (other) and test sets (80% non-test, 20% test)
set.seed(myseed)
data_split <- initial_split(data_diabetes_narm, strata = BMI, prop = 0.8) # holds splitting info
data_other <- data_split %>% training() # creates non-test set (the function is called training() but here it returns the non-test part)
data_test <- data_split %>% testing() # creates test set
# prepare repeated cross-validation splits with 5 folds repeated 3 times
set.seed(myseed)
data_folds <- vfold_cv(data_other,
                       v = 5,
                       repeats = 3,
                       strata = BMI)
# check the split
dim(data_diabetes)
## [1] 403 20
dim(data_other)
## [1] 291 18
dim(data_test)
## [1] 75 18
# check BMI distributions in data splits
par(mfrow = c(3, 1))
hist(data_diabetes$BMI, breaks = 50, xlab = "", main = "BMI: all")
hist(data_other$BMI, breaks = 50, xlab = "", main = "BMI: non-test")
hist(data_test$BMI, breaks = 50, xlab = "", main = "BMI: test")
2.3 Feature engineering
# create data recipe (feature engineering)
inch2m <- 2.54 / 100
pound2kg <- 0.45

data_recipe <- recipe(BMI ~ ., data = data_other) %>%
  update_role(id, new_role = "sampleID") %>%
  step_mutate(height = height * inch2m, height = round(height, 2)) %>% # convert height to meters
  step_mutate(weight = weight * pound2kg, weight = round(weight, 2)) %>% # convert weight to kg
  step_rename(glu = stab.glu) %>% # rename stab.glu to glu
  step_log(glu) %>% # natural-log transform glucose
  step_zv(all_numeric()) %>% # removes zero-variance variables, i.e. variables containing only a single value (if found)
  step_corr(all_numeric(), -all_outcomes(), -has_role("sampleID"), threshold = 0.8) %>% # removes variables with large absolute correlations with other variables (if found)
  step_dummy(location, gender, frame) %>% # convert categorical variables to dummy variables
  step_normalize(all_numeric(), -all_outcomes(), -has_role("sampleID"), skip = FALSE)
# you can implement more steps: see https://recipes.tidymodels.org/reference/index.html

# print recipe
data_recipe

# check that the recipe is doing what it is supposed to do,
# i.e. bake the data
data_other_prep <- data_recipe %>%
  prep() %>%
  bake(new_data = NULL)

## bake test data
data_test_prep <- data_recipe %>%
  prep() %>%
  bake(new_data = data_test)

# preview baked data
print(head(data_other_prep))
## # A tibble: 6 × 17
##      id   chol    glu    hdl  ratio  glyhb    age  height  bp.1s  bp.1d    hip
##   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
## 1  1045 -0.319 -0.539 -0.830  0.486 -0.172 -0.645 -0.474  -1.16  -0.552 -1.66
## 2  1271  0.449 -1.09  -0.324  0.320 -0.458 -1.39  -1.29   -1.59  -1.00  -0.914
## 3  1277 -0.657 -0.572  2.32  -1.45  -0.642 -0.337  1.56    0.286  2.15  -1.29
## 4  1303 -0.590 -0.410 -0.436 -0.178 -0.518  0.341  0.545  -0.309  0.500 -1.47
## 5  1309 -0.206 -0.347  0.687 -0.730 -0.860 -1.32   0.0357 -0.735 -0.402 -1.66
## 6  1315 -0.793 -0.572  0.350 -0.841  0.225  0.649  1.26   -0.565 -1.45  -1.29
## # … with 6 more variables: time.ppn <dbl>, BMI <dbl>, location_Louisa <dbl>,
## #   gender_male <dbl>, frame_medium <dbl>, frame_small <dbl>
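To verify what the trained steps actually did, e.g. whether step_corr removed any variables, the prepped recipe can be inspected with tidy(); a sketch, where the step number assumes the order of steps above (step_corr is step 6):

# inspect trained recipe steps
prepped <- prep(data_recipe)
tidy(prepped)             # overview of all steps and whether they have been trained
tidy(prepped, number = 6) # variables flagged by step_corr (step 6 in this recipe)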
2.4 Lasso regression
# define model: mixture = 1 requests the Lasso (pure L1) penalty
model <- linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("regression")
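For reference, the mixture argument controls the type of regularization; other glmnet penalties would be specified the same way (a sketch, not used in this demo; model_ridge and model_enet are illustrative names):

# ridge regression: pure L2 penalty
model_ridge <- linear_reg(penalty = tune(), mixture = 0) %>%
  set_engine("glmnet") %>%
  set_mode("regression")
# elastic net: a blend of L1 and L2 penalties
model_enet <- linear_reg(penalty = tune(), mixture = 0.5) %>%
  set_engine("glmnet") %>%
  set_mode("regression")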
# create workflow with data recipe and model
wf <- workflow() %>%
  add_model(model) %>%
  add_recipe(data_recipe)
# define parameter range for tuning
grid_lambda <- grid_regular(penalty(), levels = 25)
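By default, penalty() from dials spans lambda values from 10^-10 to 1 on a log10 scale, which is why the tuning results below start at 1e-10. If you wanted a narrower search space, the range can be set explicitly (a sketch; grid_lambda_narrow is an illustrative name, and range is given in log10 units):

# e.g. restrict the grid to lambda between 10^-4 and 10^0
grid_lambda_narrow <- grid_regular(penalty(range = c(-4, 0)), levels = 25)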
# tune lambda
model_tune <- wf %>%
  tune_grid(resamples = data_folds,
            grid = grid_lambda)
# show metrics averaged across resamples (n = 15, i.e. 5 folds x 3 repeats)
model_tune %>%
  collect_metrics(summarize = TRUE)
## # A tibble: 50 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 e-10 rmse standard 2.48 15 0.142 Preprocessor1_Model01
## 2 1 e-10 rsq standard 0.851 15 0.0143 Preprocessor1_Model01
## 3 2.61e-10 rmse standard 2.48 15 0.142 Preprocessor1_Model02
## 4 2.61e-10 rsq standard 0.851 15 0.0143 Preprocessor1_Model02
## 5 6.81e-10 rmse standard 2.48 15 0.142 Preprocessor1_Model03
## 6 6.81e-10 rsq standard 0.851 15 0.0143 Preprocessor1_Model03
## 7 1.78e- 9 rmse standard 2.48 15 0.142 Preprocessor1_Model04
## 8 1.78e- 9 rsq standard 0.851 15 0.0143 Preprocessor1_Model04
## 9 4.64e- 9 rmse standard 2.48 15 0.142 Preprocessor1_Model05
## 10 4.64e- 9 rsq standard 0.851 15 0.0143 Preprocessor1_Model05
## # … with 40 more rows
# plot k-fold results across the lambda range
model_tune %>%
  collect_metrics() %>%
  dplyr::filter(.metric == "rmse") %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
  scale_x_log10() +
  geom_line(linewidth = 1.5) +
  theme_bw() +
  theme(legend.position = "none") +
  scale_color_brewer(palette = "Set1")
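As a shortcut, tuning results also have an autoplot() method that produces a comparable metrics-vs-penalty plot with less code:

# quick alternative to the manual ggplot above
autoplot(model_tune)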
# best lambda value (min. RMSE)
model_best <- model_tune %>%
  select_best(metric = "rmse")
print(model_best)
## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 0.0562 Preprocessor1_Model22
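Instead of the minimum-RMSE rule, you may prefer the one-standard-error rule, which selects the most regularized (and thus simplest) model whose performance is within one standard error of the numerically best one; in tune this could look like the sketch below (model_best_1se is an illustrative name):

# largest penalty within one std. error of the best RMSE
model_best_1se <- model_tune %>%
  select_by_one_std_err(desc(penalty), metric = "rmse")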
# finalize workflow with the tuned model
wf_final <- wf %>%
  finalize_workflow(model_best)
# last fit: trains on all non-test data and evaluates on the test data
fit_final <- wf_final %>%
  last_fit(split = data_split)
# final predictions
y_test_pred <- fit_final %>% collect_predictions() # predicted BMI
# final predictions: performance on test (unseen) data
fit_final %>% collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 2.79 Preprocessor1_Model1
## 2 rsq standard 0.857 Preprocessor1_Model1
# plot predictions vs. actual for test data
plot(data_test$BMI, y_test_pred$.pred, xlab="BMI (actual)", ylab = "BMI (predicted)", las = 1, pch = 19)
# correlation between predicted and actual BMI values for test data
# note: squaring this correlation (0.9257^2 ≈ 0.857) recovers the rsq metric reported above
cor(data_test$BMI, y_test_pred$.pred)
## [1] 0.9256857
# re-fit model on all non-test data
model_final <- wf_final %>%
  fit(data_other)
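Note that last_fit() above already trained the finalized workflow on the non-test data, so this re-fit should be equivalent to simply extracting the fitted workflow (a sketch; model_final_alt is an illustrative name):

# equivalent shortcut: pull the trained workflow out of the last_fit() result
model_final_alt <- fit_final %>% extract_workflow()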
# show final model
tidy(model_final)
## # A tibble: 16 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 28.7 0.0562
## 2 chol 0 0.0562
## 3 glu 0.0229 0.0562
## 4 hdl 0 0.0562
## 5 ratio 0.335 0.0562
## 6 glyhb -0.0512 0.0562
## 7 age -0.257 0.0562
## 8 height -1.86 0.0562
## 9 bp.1s -0.294 0.0562
## 10 bp.1d 0.203 0.0562
## 11 hip 5.60 0.0562
## 12 time.ppn 0 0.0562
## 13 location_Louisa -0.160 0.0562
## 14 gender_male 1.07 0.0562
## 15 frame_medium -0.320 0.0562
## 16 frame_small -0.530 0.0562
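Since the Lasso shrinks some coefficients exactly to zero (here chol, hdl and time.ppn), the selected features can also be pulled out programmatically, e.g.:

# keep only features with non-zero coefficients, dropping the intercept
tidy(model_final) %>%
  dplyr::filter(estimate != 0, term != "(Intercept)")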
# plot variables ordered by importance (highest abs(coeff))
model_final %>%
  extract_fit_parsnip() %>%
  vip(geom = "point") +
  theme_bw()