Hands-on analysis of actual data is the best way to learn R programming. This page contains some data sets that you can use to explore what you have learned in this course. For each data set, a brief description and download instructions are provided.
Try to focus on using the tools from the course to explore the data, rather than worrying about producing a perfect report with a coherent analysis workflow.
On the last day you will present your Rmd file (or rather, the resulting HTML report) and share with the class what your data was about.
# Read the penguin data set straight from its CSV file on the web.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
penguins <- read.table(
  "https://vincentarelbundock.github.io/Rdatasets/csv/heplots/peng.csv",
  header = TRUE,
  sep = ","
)
# Check the structure of the data (column names, types and first values).
str(penguins)
## 'data.frame': 333 obs. of 9 variables:
## $ rownames : int 1 2 3 4 5 6 7 8 9 10 ...
## $ species : chr "Adelie" "Adelie" "Adelie" "Adelie" ...
## $ island : chr "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
## $ bill_length : num 39.1 39.5 40.3 36.7 39.3 38.9 39.2 41.1 38.6 34.6 ...
## $ bill_depth : num 18.7 17.4 18 19.3 20.6 17.8 19.6 17.6 21.2 21.1 ...
## $ flipper_length: int 181 186 195 193 190 181 195 182 191 198 ...
## $ body_mass : int 3750 3800 3250 3450 3650 3625 4675 3200 3800 4400 ...
## $ sex : chr "m" "f" "f" "f" ...
## $ year : int 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
library(dplyr)
# Download the CSV file directly from the web.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
drinks <- read.table(
  "https://vincentarelbundock.github.io/Rdatasets/csv/stevedata/nesarc_drinkspd.csv",
  header = TRUE,
  sep = ","
)
# Take a random sample of 3000 rows (without replacement) from the full data
# set; the fixed seed makes the sample reproducible.
set.seed(seed = 2)
drinks <- sample_n(drinks, size = 3000, replace = FALSE)
# Check the structure of the data.
str(drinks)
## 'data.frame': 3000 obs. of 9 variables:
## $ rownames : int 11014 36044 15657 11851 14800 4914 25399 9864 32033 23401 ...
## $ idnum : int 11014 36044 15657 11851 14800 4914 25399 9864 32033 23401 ...
## $ ethrace2a: int 5 1 1 1 1 2 1 5 1 1 ...
## $ region : int 3 1 2 1 2 3 3 2 4 1 ...
## $ age : int 56 55 42 84 18 42 72 39 23 61 ...
## $ sex : int 1 1 1 1 1 1 1 1 0 1 ...
## $ marital : int 1 4 4 3 6 1 3 6 1 5 ...
## $ educ : int 4 4 3 1 2 2 2 2 2 3 ...
## $ s2aq8b : int NA 1 3 NA NA NA NA NA 4 NA ...
library(dplyr)
# Download the CSV file directly from the web.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
crashes <- read.table(
  "https://vincentarelbundock.github.io/Rdatasets/csv/DAAG/nassCDS.csv",
  header = TRUE,
  sep = ","
)
# Take a random sample of 3000 rows (without replacement) from the full data
# set; the fixed seed makes the sample reproducible.
set.seed(seed = 2)
crashes <- sample_n(crashes, size = 3000, replace = FALSE)
# Check the structure of the data.
str(crashes)
## 'data.frame': 3000 obs. of 16 variables:
## $ rownames : int 12117 13263 4806 11014 8465 21853 3276 3453 15657 17074 ...
## $ dvcat : chr "10-24" "25-39" "10-24" "10-24" ...
## $ weight : num 5363.9 29.5 107.4 194.2 98.9 ...
## $ dead : chr "alive" "alive" "alive" "alive" ...
## $ airbag : chr "none" "none" "none" "none" ...
## $ seatbelt : chr "belted" "belted" "none" "belted" ...
## $ frontal : int 0 1 0 0 1 1 1 1 0 0 ...
## $ sex : chr "f" "m" "f" "f" ...
## $ ageOFocc : int 20 38 20 59 40 19 30 43 39 31 ...
## $ yearacc : int 1999 2000 1998 1999 1999 2002 1997 1997 2000 2000 ...
## $ yearVeh : int 1984 1990 1991 1985 1996 2001 1989 1994 1990 1983 ...
## $ abcat : chr "unavail" "unavail" "unavail" "unavail" ...
## $ occRole : chr "driver" "driver" "pass" "driver" ...
## $ deploy : int 0 0 0 0 1 1 0 1 0 0 ...
## $ injSeverity: int 0 2 0 3 1 0 3 2 3 3 ...
## $ caseid : chr "75:85:2" "4:115:1" "9:1:2" "48:61:1" ...
library(dplyr)
# Download the CSV file directly from the web.
# quote = "\"" restricts quoting to double quotes. With read.table()'s default
# (quote = "\"'") an apostrophe inside a field (such as a country name) opens
# a quoted string that runs on to the next apostrophe, silently swallowing all
# the rows in between.
gapminder <- read.table(
  "https://vincentarelbundock.github.io/Rdatasets/csv/dslabs/gapminder.csv",
  header = TRUE,
  sep = ",",
  quote = "\""
)
# Keep only the observations from the year 2000 onwards.
gapminder <- gapminder |> filter(year >= 2000)
# Check the structure of the data.
str(gapminder)
## 'data.frame': 3145 obs. of 10 variables:
## $ rownames : int 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 ...
## $ country : chr "Albania" "Algeria" "Angola" "Antigua and Barbuda" ...
## $ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ infant_mortality: num 23.2 33.9 128.3 13.8 18 ...
## $ life_expectancy : num 74.7 73.3 52.3 73.8 74.2 ...
## $ fertility : num 2.38 2.51 6.84 2.32 2.48 1.3 1.87 1.76 1.37 2.05 ...
## $ population : int 3121965 31183658 15058638 77648 37057453 3076098 90858 19107251 8050884 8117742 ...
## $ gdp : num 3.69e+09 5.48e+10 9.13e+09 8.03e+08 2.84e+11 ...
## $ continent : chr "Europe" "Africa" "Africa" "Americas" ...
## $ region : chr "Southern Europe" "Northern Africa" "Middle Africa" "Caribbean" ...
library(dplyr)
# Download the CSV file directly from the web.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
stackoverflow <- read.table(
  "https://vincentarelbundock.github.io/Rdatasets/csv/modeldata/stackoverflow.csv",
  header = TRUE,
  sep = ","
)
# Take a random sample of 3000 rows from the full data set; the fixed seed
# makes the sample reproducible.
set.seed(2)
stackoverflow <- sample_n(stackoverflow, size = 3000)
# Check the structure of the data.
str(stackoverflow)
## 'data.frame': 3000 obs. of 22 variables:
## $ rownames : int 3925 5071 4806 2822 4512 4488 273 5469 3276 3453 ...
## $ Country : chr "Germany" "United States" "United States" "United States" ...
## $ Salary : num 80645 135000 85000 127000 4405 ...
## $ YearsCodedJob : int 19 20 5 20 2 4 3 3 5 3 ...
## $ OpenSource : int 1 1 1 1 0 0 0 1 0 0 ...
## $ Hobby : int 1 1 1 1 0 1 1 1 0 1 ...
## $ CompanySizeNumber : int 10000 10000 5000 20 1 100 1000 20 1000 10000 ...
## $ Remote : chr "Not remote" "Not remote" "Not remote" "Remote" ...
## $ CareerSatisfaction : int 10 8 7 7 8 6 8 5 8 8 ...
## $ Data_scientist : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Database_administrator : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Desktop_applications_developer : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Developer_with_stats_math_background: int 1 0 0 0 0 0 0 0 0 0 ...
## $ DevOps : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Embedded_developer : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Graphic_designer : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Graphics_programming : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Machine_learning_specialist : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Mobile_developer : int 1 0 0 0 1 0 0 0 0 0 ...
## $ Quality_assurance_engineer : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Systems_administrator : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Web_developer : int 1 1 0 0 1 1 1 1 1 0 ...
library(dplyr)
# Download the CSV file directly from the web.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
doctor <- read.table(
  "https://vincentarelbundock.github.io/Rdatasets/csv/AER/DoctorVisits.csv",
  header = TRUE,
  sep = ","
)
# Take a random sample of 3000 rows from the full data set; the fixed seed
# makes the sample reproducible.
set.seed(2)
doctor <- sample_n(doctor, size = 3000)
# Check the structure of the data.
str(doctor)
## 'data.frame': 3000 obs. of 13 variables:
## $ rownames : int 3925 5071 4806 2822 4512 4488 273 3276 3453 690 ...
## $ visits : int 0 0 0 0 0 0 1 0 0 1 ...
## $ gender : chr "male" "female" "male" "male" ...
## $ age : num 0.27 0.67 0.32 0.19 0.22 0.22 0.22 0.37 0.67 0.62 ...
## $ income : num 0.15 0.25 0.9 0.65 0.35 0.75 0.75 0.25 0.35 0.25 ...
## $ illness : int 0 1 0 1 0 1 0 1 1 5 ...
## $ reduced : int 0 0 0 0 0 0 0 14 0 2 ...
## $ health : int 0 0 0 0 0 0 1 4 0 2 ...
## $ private : chr "no" "no" "yes" "yes" ...
## $ freepoor : chr "no" "no" "no" "no" ...
## $ freerepat: chr "no" "yes" "no" "no" ...
## $ nchronic : chr "yes" "yes" "no" "no" ...
## $ lchronic : chr "no" "no" "no" "no" ...
library(dplyr)
library(lubridate)
# Download the zip archive to your working directory.
# mode = "wb" asks for a binary transfer explicitly, so the archive cannot be
# damaged by line-ending translation on Windows.
download.file(
  url = "https://maven-datasets.s3.amazonaws.com/Video+Game+Sales/Video+Game+Sales.zip",
  destfile = "video_game_sales.zip",
  mode = "wb"
)
# Read the CSV file out of the archive and into R.
# unz() takes the archive path first (description) and the name of the file
# inside the archive second (filename); both are named here so they cannot be
# mixed up. quote = "\"" keeps apostrophes in text fields from being treated
# as quote characters.
videogames <- read.table(
  unz(description = "video_game_sales.zip", filename = "vgchartz-2024.csv"),
  header = TRUE,
  sep = ",",
  quote = "\"",
  fill = TRUE
)
# Keep only the rows whose release date falls in 2001 or 2002.
videogames <- filter(
  videogames,
  year(as_date(release_date)) %in% c(2001, 2002)
)
# Check the structure of the data.
str(videogames)
## 'data.frame': 3201 obs. of 14 variables:
## $ img : chr "/games/boxart/827563ccc.jpg" "/games/boxart/3570928ccc.jpg" "/games/boxart/7583871ccc.jpg" "/games/boxart/9261584ccc.jpg" ...
## $ title : chr "Grand Theft Auto: Vice City" "Grand Theft Auto III" "Medal of Honor: Frontline" "Crash Bandicoot: The Wrath of Cortex" ...
## $ console : chr "PS2" "PS2" "PS2" "PS2" ...
## $ genre : chr "Action" "Action" "Shooter" "Platform" ...
## $ publisher : chr "Rockstar Games" "Rockstar Games" "Electronic Arts" "Universal Interactive" ...
## $ developer : chr "Rockstar North" "DMA Design" "EA Los Angeles" "Traveller's Tales" ...
## $ critic_score: num 9.6 9.5 9 6.9 8.3 8.2 9.1 NA 9.4 7.3 ...
## $ total_sales : num 16.15 13.1 6.83 5.42 4.67 ...
## $ na_sales : num 8.41 6.99 2.93 2.07 1.94 2.71 2.66 3 3.36 2.03 ...
## $ jp_sales : num 0.47 0.3 0.17 0.24 0.08 0.03 0.01 0.05 0.01 NA ...
## $ pal_sales : num 5.49 4.51 2.75 2.29 1.95 1.51 1.29 1.11 0.21 1.56 ...
## $ other_sales : num 1.78 1.3 0.99 0.82 0.7 0.23 0.46 0.07 0.56 0.17 ...
## $ release_date: chr "2002-10-28" "2001-10-23" "2002-05-28" "2001-10-29" ...
## $ last_update : chr "" "" "" "" ...
library(dplyr)
# Download the zip archive to your working directory.
# mode = "wb" asks for a binary transfer explicitly, so the archive cannot be
# damaged by line-ending translation on Windows.
download.file(
  url = "https://maven-datasets.s3.amazonaws.com/LEGO+Sets/LEGO+Sets.zip",
  destfile = "lego.csv.zip",
  mode = "wb"
)
# Read the CSV file out of the archive and into R.
# unz() takes the archive path first (description) and the name of the file
# inside the archive second (filename); both are named here so they cannot be
# mixed up. quote = "\"" keeps apostrophes in text fields from being treated
# as quote characters.
lego <- read.table(
  unz(description = "lego.csv.zip", filename = "lego_sets.csv"),
  header = TRUE,
  sep = ",",
  quote = "\"",
  fill = TRUE
)
# Keep only the sets from the years 2000-2009.
lego <- filter(lego, year %in% 2000:2009)
# Check the structure of the data.
str(lego)
## 'data.frame': 4304 obs. of 14 variables:
## $ set_id : chr "1086-1" "1177-1" "1196-1" "1197-1" ...
## $ name : chr "Bulk Bucket" "Santa's Truck" "Telekom Race Cyclist" "Telekom Race Cyclist and Television Motorbike" ...
## $ year : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ theme : chr "Duplo" "Town" "Town" "Town" ...
## $ subtheme : chr "" "Special" "Telekom" "Telekom" ...
## $ themeGroup : chr "Pre-school" "Modern day" "Modern day" "Modern day" ...
## $ category : chr "Normal" "Normal" "Normal" "Normal" ...
## $ pieces : int 48 27 7 26 81 129 10 27 26 23 ...
## $ minifigs : int NA 1 1 3 3 8 2 NA NA 1 ...
## $ agerange_min : int NA NA NA NA NA NA NA NA NA NA ...
## $ US_retailPrice: num NA NA NA NA NA NA NA NA NA NA ...
## $ bricksetURL : chr "https://brickset.com/sets/1086-1" "https://brickset.com/sets/1177-1" "https://brickset.com/sets/1196-1" "https://brickset.com/sets/1197-1" ...
## $ thumbnailURL : chr "https://images.brickset.com/sets/small/1086-1.jpg" "https://images.brickset.com/sets/small/1177-1.jpg" "https://images.brickset.com/sets/small/1196-1.jpg" "https://images.brickset.com/sets/small/1197-1.jpg" ...
## $ imageURL : chr "https://images.brickset.com/sets/images/1086-1.jpg" "https://images.brickset.com/sets/images/1177-1.jpg" "https://images.brickset.com/sets/images/1196-1.jpg" "https://images.brickset.com/sets/images/1197-1.jpg" ...