Exercises

Exercise 1 (KNN with k-fold cross-validation) Modify the demo example to select the best value of \(k\) using 5-fold cross-validation. Do we get a different value of the best \(k\)? Does it improve the performance on the test data?

Hint: you can use the create_folds() function from library(splitTools) to create the folds.
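
For reference, here is a minimal sketch of what create_folds() returns (the toy factor y_toy is made up purely for illustration): by default it gives a named list of training index vectors, one per fold, and calling it again with invert = TRUE and the same seed gives the corresponding held-out validation indices.

library(splitTools)

# toy outcome vector (hypothetical, for illustration only)
y_toy <- factor(rep(c("No", "Yes"), each = 10))

# named list of 5 vectors of training indices, stratified by y_toy
folds_train <- create_folds(y_toy, k = 5, seed = 1)

# same folds, but returning the held-out (validation) indices instead
folds_valid <- create_folds(y_toy, k = 5, invert = TRUE, seed = 1)

str(folds_train)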

Answers to exercises

Solution. Exercise 1

# load libraries
library(tidyverse)
library(splitTools)
library(kknn)

# input data
input_diabetes <- read_csv("data/data-diabetes.csv")

# clean data
inch2cm <- 2.54
pound2kg <- 0.45
data_diabetes <- input_diabetes %>%
  mutate(height  = height * inch2cm / 100, height = round(height, 2)) %>% 
  mutate(waist = waist * inch2cm) %>% 
  mutate(weight = weight * pound2kg, weight = round(weight, 2)) %>%
  mutate(BMI = weight / height^2, BMI = round(BMI, 2)) %>%
  mutate(obese = cut(BMI, breaks = c(0, 29.9, 100), labels = c("No", "Yes"))) %>%
  mutate(diabetic = ifelse(glyhb > 7, "Yes", "No"), diabetic = factor(diabetic, levels = c("No", "Yes"))) %>%
  mutate(location = factor(location)) %>%
  mutate(frame = factor(frame)) %>%
  mutate(gender = factor(gender))
  
# select data for KNN
data_input <- data_diabetes %>%
  select(obese, waist, hdl) %>%
  na.omit()

# set random seed
randseed <- 123
set.seed(randseed)

# split data into other (non-test) and test
splits <- partition(data_input$obese, p = c(other = 0.80, test = 0.20))
data_test <- data_input[splits$test, ]
data_other <- data_input[splits$other, ]

# create train and validation folds
kfolds_train <- create_folds(data_other$obese, k = 5, seed = randseed)
kfolds_valid <- create_folds(data_other$obese, k = 5, 
                             invert = TRUE, # gives back indices of the validation samples
                             seed = randseed) # OBS! use the same seed as above for kfolds_train so the train and validation folds match


# prepare parameter search space
n <- nrow(data_other)
k_values <- seq(1, 100, 2) # check every odd value of k between 1 and 100

# allocate an empty matrix to collect the overall classification rate for each k and each of the 5 folds
folds <- 5
cls_rate <- matrix(data = NA, ncol = folds, nrow = length(k_values))
colnames(cls_rate) <- paste0("kfold", 1:folds)
rownames(cls_rate) <- paste0("k", k_values)

for (k in seq_along(k_values)) {
  # for every value of k 
  # fit model on every train fold and evaluate on every validation fold
  for (f in 1:folds){
    
    data_train <- data_other[kfolds_train[[f]], ]
    data_valid <- data_other[kfolds_valid[[f]], ]
    
    # fit model given k value
    model <- kknn(obese ~ ., data_train, data_valid,
                  k = k_values[k],
                  kernel = "rectangular")
    
    # extract predicted class (predicted obesity status)
    cls_pred <- model$fitted.values
  
    # define actual class (actual obesity status)
    cls_true <- data_valid$obese
  
    # calculate overall classification rate
    cls_rate[k, f] <- sum(cls_pred == cls_true) / length(cls_pred)
    
  }
  
}

# plot average classification rate (across folds) as a function of k
cls_rate_avg <- apply(cls_rate, 1, mean)
plot(k_values, cls_rate_avg, type = "l", xlab = "k", ylab = "classification rate (avg)")


# For which value of k do we reach the highest classification rate?
k_best <- k_values[which.max(cls_rate_avg)]
print(k_best)
## [1] 71

# How would our model perform on future data using the optimal k?
model_final <- kknn(obese ~ ., data_other, data_test, k = k_best, kernel = "rectangular")
cls_pred <- model_final$fitted.values
cls_true <- data_test$obese

cls_rate <- sum(cls_pred == cls_true) / length(cls_pred)
print(cls_rate)
## [1] 0.8481013