-03-data transformation
Last updated
Last updated
Link to the module:
More about dplyr: https://dplyr.tidyverse.org/
dplyr cheatseat from RStudio: https://www.rstudio.com/resources/cheatsheets/#dplyr
library(nycflights13)
library(tidyverse)
############
#YOUR TURN!
############
#Are there vignettes for the dplyr package?
#Can you find additional documentation explaining the
#flights data set?
#SOLUTION
# are there vignettes for the dplyr package —> yes, 8 of them
vignette(package = "dplyr")
# additional documentation for the mpg data set
?flights
###############################################
#filter
#Filter values based on defined conditions
###############################################
#Filter based on one or more variables
View(flights)
filter(flights, month == 1)
View(filter(flights, month == 1))
filter(flights, month == 1, day == 1)
View(filter(flights, month == 1, day == 1))
filter(flights, month == 1, day == 1, dep_delay > 0)
View(filter(flights, month == 1, day == 1, dep_delay > 0))
#SAVE NEW DATA FRAME
#• Save filter data frame using assignment operator (<-)
dec25 <- filter(flights, month == 12, day == 25)
dec25
####
#LOGICAL TESTS
####
12 == 12
#[1] TRUE
12 <= c(12, 11)
#[1] TRUE FALSE
12 %in% c(12, 11, 8) #Group membership
#[1] TRUE
x <- c(12, NA, 11, NA, 8)
is.na(x)
#[1] FALSE TRUE FALSE TRUE FALSE
##?Comparison##
# < Less than
# > Greater than
# == Equal to
# <= Less than or equal to
# >= Greater than or equal to
# != Not equal to
# %in% Group membership
# is.na Is NA
# !is.na Is not NA
####
#COMPARISON
####
#What will these operations produce?
filter(flights, month == 12)
filter(flights, month != 12)
filter(flights, month %in% c(11, 12))
filter(flights, arr_delay <= 120)
filter(flights, !(arr_delay <= 120))
filter(flights, is.na(tailnum))
####
#MULTIPLE LOGICAL TESTS
####
12 == 12 & 12 < 14
#[1] TRUE
12 == 12 & 12 < 10
#[1] FALSE
12 == 12 | 12 < 10
#[1] TRUE
any(12 == 12, 12 < 10)
#[1] TRUE
all(12 == 12, 12 < 10)
#[1] FALSE
##?base::Logic##
# & boolean and
# | boolean or
# xor exclusively x or y
# ! not
# any any true :at least one is TRUE
# all all true
#####
#MULTIPLE COMPARISONS
####
#Using comma is same as using &
filter(flights, month == 12, day == 25)
filter(flights, month == 12 & day == 25)
#Use %in% as a shortcut for |
filter(flights, month == 11 | month == 12)
filter(flights, month %in% c(11, 12))
#Are these the same????
filter(flights, !(arr_delay > 120 | dep_delay > 120))
filter(flights, arr_delay <= 120, dep_delay <= 120)
####
#YOUR TURN & SOLUTION
####
# 1: import the data
customer <- read_csv("data/CustomerData.csv")
#Import the data with IDE RStudio
customer<-CustomerData
# 2: filter for female customers only
filter(customer, Gender == "Female")
# 3: filter for female customers that are greater than 45 years old and live in region 3
filter(customer, Gender == "Female", Age > 45, Region == 3)
# 4: filter for female customers that are greater than 45 years old or live in region 3
filter(customer, Gender == "Female", Age > 45 | Region == 3)
#############################
#select
#Select variables of concern
#############################
customer<-CustomerData
####
#SELECTING VARIABLES
###
#Select one or more variables
select(flights, year, month, day)
#Same as:
select(flights, year:day)
####
#SELECTING VARIABLES
####
#Deselect one or more variables
select(flights, -(year:day))
####
#USEFUL select FUNCTIONS
####
# - Select everything but
# : Select range
# contains() Select columns whose name contains a character string
# ends_with() Select columns whose name ends with a string
# everything() Select every column
# matches() Select columns whose name matches a regular expression
# num_range() Select columns named x1, x2, x3, x4, x5
# one_of() Select columns whose names are in a group of names
# starts_with() Select columns whose name starts with a character string
select(flights, ends_with("time"))
select(flights, c(carrier, ends_with("time"), contains("delay")))
####
#VARIABLE PLACEMENT
####
select(flights, time_hour, air_time, everything())
####
#RENAMING VARIABLES
####
rename(flights, ANNOYING = dep_delay)
####
#YOUR TURN! & SOLUTION
####
# 1. select all variables between CustomerID and Gender
select(customer, CustomerID:Gender)
# 2. select all variables except for those between CustomerID and Gender
select(customer, -(CustomerID:Gender))
#3. select CustomerID and all variables that contain the word “Card”
select(customer, CustomerID, contains("Card"))
###############
#arrange
#Reorder data
###############
customer<-CustomerData
####
#ORDERING YOUR DATA
####
#Order data based on one or more variables
arrange(flights, dep_delay)
arrange(flights, dep_delay, arr_delay)
#Reverse the order by using desc()
arrange(flights, desc(dep_delay))
#Note that missing values are always sorted at the end:
df <- tibble(x = c(5, 2, 5, NA))
arrange(df, x)
arrange(df, desc(x))
####
#YOUR TURN! & SOLUTION
####
# 1: select variables
sub_cust <- select(customer, CustomerID, Region, Gender, Age, HHIncome, CardSpendMonth)
# 2: Order sub_cust data by Age and CardSpendMonth (ascending order)
arrange(customer, Age, CardSpendMonth)
# 3: Order sub_cust data by Age (oldest to youngest) and CardSpendMonth (least to most)
arrange(customer, desc(Age), CardSpendMonth)
##########################################################
#mutate
#Create new variables with functions of existing variables
##########################################################
####
#REDUCE OUR DATA
####
#Lets work with a smaller data set
flights_sml <- select(flights, year:day, ends_with("delay"), distance, air_time )
flights_sml
####
#CREATE NEW VARIABLES
####
#mutate() creates new variables with functions of existing variables:
mutate(flights_sml, gain = arr_delay - dep_delay, speed = distance / air_time * 60 )
#Note: you can create variables based on columns that you’ve just created:
mutate(flights_sml, gain = arr_delay - dep_delay, hours = air_time / 60, gain_per_hour = gain / hours )
#If you only want to keep the new variables use transmute():
transmute(flights, gain = arr_delay - dep_delay, hours = air_time / 60, gain_per_hour = gain / hours )
####
#MANY USEFUL CREATION FUNCTIONS:
###
#There are a wide variety of functions you can use with mutate()
# +, -, *, /, ^ arithmetic
# x / sum(x) arithmetic w/aggregate functions
# %/%, %% modular arithmetic
# log, exp, sqrt transformations
# lag, lead offsets
# cumsum, cumprod, cum… cum/rolling aggregates
# >, >=, <, <=,
# !=, == logical comparisons
# min_rank, dense_rank, etc ranking
# between are values between a and b?
# ntile bin values into buckets
transmute(flights, normalized_delay = dep_delay / mean(dep_delay, na.rm = TRUE))
transmute(flights, log_air_time = log2(air_time), exp_delay = exp(dep_delay))
transmute(flights, dep_delay = dep_delay, lag_delay = lag(dep_delay), sum_delay = cumsum(dep_delay))
####
#YOUR TURN! & SOLUTION
####
#1: create a ratio variable that computes the ratio of CardSpendMonth to HHIncome
mutate(sub_cust, ratio = CardSpendMonth / HHIncome)
#2: create 2 variables: #
#i) ratio1 = CardSpendMonth / HHIncome
#ii) ratio2 = CardSpendMonth / Age
mutate(sub_cust, ratio1 = CardSpendMonth / HHIncome, ratio2 = CardSpendMonth / Age )
#########################################################
#summarise
#Collapse many values down to a single summary statistic
#########################################################
####
#SUMMARIZING OUR DATA
####
#We can create summary statistics of one or more variables:
summarise(flights, dep_delay_mean = mean(dep_delay, na.rm = TRUE))
#We can create summary statistics of one or more variables:
summarise(flights, dep_delay_mean = mean(dep_delay, na.rm = TRUE), dep_delay_sd = sd(dep_delay, na.rm = TRUE) )
summarise(flights, dep_delay_mean = mean(dep_delay, na.rm = TRUE), dep_delay_sd = sd(dep_delay, na.rm = TRUE), n = n() )
####
#SUMMARY FUNCTIONS
####
#* All take a vector of values and return a single value
#** Blue functions come in dplyr
# min(), max() Minimum and maximum values
# mean() Mean value
# median() Median value
# sum() Sum of values
# var, sd() Variance and standard deviation of a vector
# first() First value in a vector last() Last value in a vector
# nth() Nth value in a vector
# n() The number of values in a vector
# n_distinct() The number of distinct values in a vector
####
#SUMMARIZING GROUPED DATA
####
#Summary statistics become more powerful when we can compare groups:
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
####
#YOUR TURN! & SOLUTION
####
#1: Avg spend across all customers
summarize(sub_cust, Avg_spend = mean(CardSpendMonth, na.rm = TRUE) )
#2: Now compute the average CardSpendMonth for each gender.
by_gender <- group_by(sub_cust, Gender) summarize(by_gender, Avg_spend = mean(CardSpendMonth, na.rm = TRUE))
#3: Now compute the average CardSpendMonth for each gender and region. # Which gender and region have the highest average spend?
by_gdr_rgn <- group_by(sub_cust, Gender, Region)
avg_gdr_rgn <- summarize(by_gdr_rgn, Avg_spend = mean(CardSpendMonth, na.rm = TRUE))
arrange(avg_gdr_rgn, desc(Avg_spend))
####################################################
#pipe operator
#Chaining functions together with the pipe operator
####################################################
####
#STREAMLINING OUR ANALYSIS
####
sub_cust <- select(customer, CustomerID, Region, Gender, Age, HHIncome, CardSpendMonth)
# Going back to our last problem, our code was doing three things:
# 1. grouping by gender and region
# 2. summarizing average spend
# 3. sorting spend by greatest to least
by_gdr_rgn <- group_by(sub_cust, Gender, Region)
avg_gdr_rgn <- summarize(by_gdr_rgn, Avg_spend = mean(CardSpendMonth, na.rm = TRUE))
arrange(avg_gdr_rgn, desc(Avg_spend))
#We can streamline our code to make it more efficient and legible
library(magrittr)
x <- 1:15
sum(x)
x %>% sum()
#• Lets re-write our code using the pipe (%>%) operator
#• This code does four things in a very efficient & readable manner
sub_cust %>%
group_by(Gender, Region) %>%
summarize(Avg_spend = mean(CardSpendMonth, na.rm = TRUE)) %>%
arrange(desc(Avg_spend))
####
#YOUR TURN! & SOLUTION
####
#Using the pipe operator follow these steps with the sub_cust data:
#1. filter for male customers only
#2. create a new variable: ratio = CardSpendMonth / HHIncome
#3. group this data by age
#4. compute the mean of the new ratio variable by age
#5. sort this output to find the age with the highest ratio of expenditures to income.
sub_cust %>%
filter(Gender == "Male") %>%
mutate(ratio = CardSpendMonth / HHIncome) %>%
group_by(Age) %>%
summarize(Avg_ratio = mean(ratio, na.rm = TRUE)) %>%
arrange(desc(Avg_ratio))