2 Demo: k-nearest neighbors (KNN)

Let’s try to build a classifier to predict obesity (Obese vs Non-obese) given our diabetes data set. To start simple:

we will see how well we can predict obesity given waist and hdl variables
we will use data splitting into train, validation and test, i.e. not cross-validation with the help of splitTools() library
we will use KNN algorithm as implemented in kknn() function in library(kknn)

Reading in data

# load libraries
library(tidyverse)
library(splitTools)
library(kknn)

# input data
input_diabetes <- read_csv("data/data-diabetes.csv")

# clean data
inch2cm <- 2.54
pound2kg <- 0.45
data_diabetes <- input_diabetes %>%
  mutate(height  = height * inch2cm / 100, height = round(height, 2)) %>% 
  mutate(waist = waist * inch2cm) %>% 
  mutate(weight = weight * pound2kg, weight = round(weight, 2)) %>%
  mutate(BMI = weight / height^2, BMI = round(BMI, 2)) %>%
  mutate(obese= cut(BMI, breaks = c(0, 29.9, 100), labels = c("No", "Yes"))) %>%
  mutate(diabetic = ifelse(glyhb > 7, "Yes", "No"), diabetic = factor(diabetic, levels = c("No", "Yes"))) %>%
  mutate(location = factor(location)) %>%
  mutate(frame = factor(frame)) %>%
  mutate(gender = factor(gender))
  
# select data for KNN
data_input <- data_diabetes %>%
  select(obese, waist, hdl) %>%
  na.omit()

# How many obese and non-obese in our data set?
data_input %>%
  count(obese)
## # A tibble: 2 × 2
##   obese     n
##   <fct> <int>
## 1 No      250
## 2 Yes     144

# preview data
glimpse(data_input)
## Rows: 394
## Columns: 3
## $ obese <fct> No, Yes, Yes, No, No, No, No, Yes, No, Yes, No, Yes, Yes, Yes, N…
## $ waist <dbl> 73.66, 116.84, 124.46, 83.82, 111.76, 91.44, 116.84, 86.36, 86.3…
## $ hdl   <dbl> 56, 24, 37, 12, 28, 69, 41, 44, 49, 40, 54, 34, 36, 46, 30, 47, …

data_input %>%
  ggplot(aes(x = waist, y = hdl, fill = obese)) + 
  geom_point(shape=21, alpha = 0.7, size = 2) + 
  theme_bw() + 
  scale_fill_manual(values = c("blue3", "brown1")) + 
  theme(legend.position = "top")

Splitting data

# split data into train (40%), validation (40%) and test (20%)
# stratify by obese
randseed <- 123
set.seed(randseed)
inds <- partition(data_input$obese, p = c(train = 0.4, valid = 0.4, test = 0.2), seed = randseed)
str(inds)
## List of 3
##  $ train: int [1:158] 6 8 11 16 17 22 26 28 32 35 ...
##  $ valid: int [1:157] 1 3 5 7 9 10 13 14 15 24 ...
##  $ test : int [1:79] 2 4 12 18 19 20 21 23 27 30 ...
data_train <- data_input[inds$train, ]
data_valid <- data_input[inds$valid,]
data_test <- data_input[inds$test, ]

# check dimensions of data
data_train %>% dim()
## [1] 158   3
data_valid %>% dim()
## [1] 157   3
data_test %>% dim()
## [1] 79  3

# check distribution of obese and non-obese
data_train %>%
  group_by(obese) %>%
  count()
## # A tibble: 2 × 2
## # Groups:   obese [2]
##   obese     n
##   <fct> <int>
## 1 No      100
## 2 Yes      58

data_valid %>%
  group_by(obese) %>%
  count()
## # A tibble: 2 × 2
## # Groups:   obese [2]
##   obese     n
##   <fct> <int>
## 1 No      100
## 2 Yes      57

data_test %>%
  group_by(obese) %>%
  count()
## # A tibble: 2 × 2
## # Groups:   obese [2]
##   obese     n
##   <fct> <int>
## 1 No       50
## 2 Yes      29

Training KNN

# prepare parameters search space
n <- nrow(data_train)
k_values <- seq(1, n-1, 2) # check every odd value of k between 1 and number of samples-1

# allocate empty vector to collect overall classification rate for each k
cls_rate <- rep(0, length(k_values)) 

for (l in seq_along(k_values))
{
  
  # fit model given k value
  model <- kknn(obese ~., data_train, data_valid, 
                k = k_values[l], 
                kernel = "rectangular")
  
  # extract predicted class (predicted obesity status)
  cls_pred <- model$fitted.values
  
  # define actual class (actual obesity status)
  cls_true <- data_valid$obese
  
  # calculate overall classification rate
  cls_rate[l] <- sum((cls_pred == cls_true))/length(cls_pred)
  
}

Selecting best \(k\)

# plot classification rate as a function of k
plot(k_values, cls_rate, type="l", xlab="k", ylab="cls rate")

Overall classification rate as a function of k

# For which value of k do we reach the highest classification rate?
k_best <- k_values[which.max(cls_rate)]
print(k_best)
## [1] 51

Final model and performance on future unseen data (test data)

# How would our model perform on the future data using the optimal k?
model_final <- kknn(obese ~., data_train, data_test, k=k_best, kernel = "rectangular")
cls_pred <- model_final$fitted.values
cls_true <- data_test$obese

cls_rate <- sum((cls_pred == cls_true))/length(cls_pred)
print(cls_rate)
## [1] 0.8481013