-03-data transformation

Link to the module:

More about dplyr: https://dplyr.tidyverse.org/

dplyr cheatseat from RStudio: https://www.rstudio.com/resources/cheatsheets/#dplyr

library(nycflights13)
library(tidyverse)
############
#YOUR TURN!
############

#Are there vignettes for the dplyr package?
#Can you find additional documentation explaining the
#flights data set?

#SOLUTION
# are there vignettes for the dplyr package —> yes, 8 of them
vignette(package = "dplyr")
# additional documentation for the mpg data set
?flights

FILTER:

###############################################
#filter
#Filter values based on defined conditions
###############################################

#Filter based on one or more variables
View(flights)
filter(flights, month == 1)
View(filter(flights, month == 1))

filter(flights, month == 1, day == 1)
View(filter(flights, month == 1, day == 1))

filter(flights, month == 1, day == 1, dep_delay > 0)
View(filter(flights, month == 1, day == 1, dep_delay > 0))

#SAVE NEW DATA FRAME
#• Save filter data frame using assignment operator (<-)
dec25 <- filter(flights, month == 12, day == 25)
dec25

####
#LOGICAL TESTS
####
12 == 12
#[1] TRUE
12 <= c(12, 11)
#[1] TRUE FALSE
12 %in% c(12, 11, 8) #Group membership
#[1] TRUE
x <- c(12, NA, 11, NA, 8)
is.na(x)
#[1] FALSE TRUE FALSE TRUE FALSE

##?Comparison##
# < Less than
# > Greater than
# == Equal to
# <= Less than or equal to
# >= Greater than or equal to
# != Not equal to
# %in% Group membership
# is.na Is NA
# !is.na Is not NA

####
#COMPARISON
####
#What will these operations produce?
filter(flights, month == 12)
filter(flights, month != 12)
filter(flights, month %in% c(11, 12))
filter(flights, arr_delay <= 120)
filter(flights, !(arr_delay <= 120))
filter(flights, is.na(tailnum))

####
#MULTIPLE LOGICAL TESTS
####
12 == 12 & 12 < 14 
#[1] TRUE
12 == 12 & 12 < 10
#[1] FALSE
12 == 12 | 12 < 10
#[1] TRUE
any(12 == 12, 12 < 10)
#[1] TRUE
all(12 == 12, 12 < 10)
#[1] FALSE

##?base::Logic##
# & boolean and
# | boolean or
# xor exclusively x or y
# ! not
# any any true :at least one is TRUE
# all all true

#####
#MULTIPLE COMPARISONS 
####
#Using comma is same as using &
filter(flights, month == 12, day == 25)
filter(flights, month == 12 & day == 25)

#Use %in% as a shortcut for |
filter(flights, month == 11 | month == 12)
filter(flights, month %in% c(11, 12))

#Are these the same????
filter(flights, !(arr_delay > 120 | dep_delay > 120))
filter(flights, arr_delay <= 120, dep_delay <= 120)

####
#YOUR TURN & SOLUTION
####
# 1: import the data
customer <- read_csv("data/CustomerData.csv")
#Import the data with IDE RStudio
customer<-CustomerData
# 2: filter for female customers only
filter(customer, Gender == "Female")
# 3: filter for female customers that are greater than 45 years old and live in region 3
filter(customer, Gender == "Female", Age > 45, Region == 3)
# 4: filter for female customers that are greater than 45 years old or live in region 3
filter(customer, Gender == "Female", Age > 45 | Region == 3)

SELECT:

#############################
#select 
#Select variables of concern
#############################
customer<-CustomerData
####
#SELECTING VARIABLES
###

#Select one or more variables
select(flights, year, month, day) 

#Same as:
select(flights, year:day)

####
#SELECTING VARIABLES
####
#Deselect one or more variables
select(flights, -(year:day)) 

####
#USEFUL select FUNCTIONS
####
# - Select everything but 
# : Select range 
# contains() Select columns whose name contains a character string 
# ends_with() Select columns whose name ends with a string 
# everything() Select every column 
# matches() Select columns whose name matches a regular expression 
# num_range() Select columns named x1, x2, x3, x4, x5 
# one_of() Select columns whose names are in a group of names 
# starts_with() Select columns whose name starts with a character string 

select(flights, ends_with("time")) 
select(flights, c(carrier, ends_with("time"), contains("delay"))) 

####
#VARIABLE PLACEMENT
####

select(flights, time_hour, air_time, everything()) 

####
#RENAMING VARIABLES
####
rename(flights, ANNOYING = dep_delay) 

####
#YOUR TURN! & SOLUTION
####
# 1. select all variables between CustomerID and Gender 
select(customer, CustomerID:Gender)
# 2. select all variables except for those between CustomerID and Gender 
select(customer, -(CustomerID:Gender))
#3. select CustomerID and all variables that contain the word “Card” 
select(customer, CustomerID, contains("Card"))

ARRANGE:

###############
#arrange 
#Reorder data
###############
customer<-CustomerData
####
#ORDERING YOUR DATA
####

#Order data based on one or more variables
arrange(flights, dep_delay) 
arrange(flights, dep_delay, arr_delay) 

#Reverse the order by using desc() 
arrange(flights, desc(dep_delay)) 

#Note that missing values are always sorted at the end:
df <- tibble(x = c(5, 2, 5, NA)) 
arrange(df, x) 
arrange(df, desc(x)) 

####
#YOUR TURN! & SOLUTION
####
# 1: select variables 
sub_cust <- select(customer, CustomerID, Region, Gender, Age, HHIncome, CardSpendMonth)
# 2: Order sub_cust data by Age and CardSpendMonth (ascending order) 
arrange(customer, Age, CardSpendMonth)
# 3: Order sub_cust data by Age (oldest to youngest) and CardSpendMonth (least to most) 
arrange(customer, desc(Age), CardSpendMonth)

MUTATE:

##########################################################
#mutate 
#Create new variables with functions of existing variables
##########################################################

####
#REDUCE OUR DATA
####
#Lets work with a smaller data set
flights_sml <- select(flights,   year:day,   ends_with("delay"),   distance,   air_time )
flights_sml 


####
#CREATE NEW VARIABLES
####

#mutate() creates new variables with functions of existing variables:
mutate(flights_sml,  gain = arr_delay - dep_delay,  speed = distance / air_time * 60 ) 


#Note: you can create variables based on columns that you’ve just created: 
mutate(flights_sml,  gain = arr_delay - dep_delay,  hours = air_time / 60,  gain_per_hour = gain / hours ) 

#If you only want to keep the new variables use transmute():
transmute(flights,  gain = arr_delay - dep_delay,  hours = air_time / 60,  gain_per_hour = gain / hours ) 



####
#MANY USEFUL CREATION FUNCTIONS:
###
#There are a wide variety of functions you can use with mutate()
# +, -, *, /, ^ arithmetic 
# x / sum(x) arithmetic w/aggregate functions 
# %/%, %% modular arithmetic 
# log, exp, sqrt transformations 
# lag, lead offsets 
# cumsum, cumprod, cum… cum/rolling aggregates 
# >, >=, <, <=, 
# !=, == logical comparisons 
# min_rank, dense_rank, etc ranking 
# between are values between a and b? 
# ntile bin values into buckets


transmute(flights,  normalized_delay = dep_delay / mean(dep_delay, na.rm = TRUE)) 

transmute(flights,  log_air_time = log2(air_time),  exp_delay = exp(dep_delay)) 

transmute(flights,  dep_delay = dep_delay,  lag_delay = lag(dep_delay),  sum_delay = cumsum(dep_delay)) 


####
#YOUR TURN! & SOLUTION
####
#1: create a ratio variable that computes the ratio of CardSpendMonth to HHIncome 
mutate(sub_cust, ratio = CardSpendMonth / HHIncome)

#2: create 2 variables: #        
#i) ratio1 = CardSpendMonth / HHIncome        
#ii) ratio2 = CardSpendMonth / Age 
mutate(sub_cust,   ratio1 = CardSpendMonth / HHIncome,  ratio2 = CardSpendMonth / Age  )

SUMMARISE:

#########################################################
#summarise
#Collapse many values down to a single summary statistic
#########################################################

####
#SUMMARIZING OUR DATA
####

#We can create summary statistics of one or more variables:
summarise(flights, dep_delay_mean = mean(dep_delay, na.rm = TRUE)) 

#We can create summary statistics of one or more variables:
summarise(flights,   dep_delay_mean = mean(dep_delay, na.rm = TRUE),  dep_delay_sd = sd(dep_delay, na.rm = TRUE) )

summarise(flights,   dep_delay_mean = mean(dep_delay, na.rm = TRUE),  dep_delay_sd = sd(dep_delay, na.rm = TRUE),  n = n() ) 

####
#SUMMARY FUNCTIONS
####
#* All take a vector of values and return a single value 
#** Blue functions come in dplyr
# min(), max() Minimum and maximum values 
# mean() Mean value 
# median() Median value 
# sum() Sum of values 
# var, sd() Variance and standard deviation of a vector 
# first() First value in a vector last() Last value in a vector 
# nth() Nth value in a vector 
# n() The number of values in a vector 
# n_distinct() The number of distinct values in a vector

####
#SUMMARIZING GROUPED DATA
####

#Summary statistics become more powerful when we can compare groups:
by_day <- group_by(flights, year, month, day) 
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE)) 

####
#YOUR TURN! & SOLUTION
####

#1: Avg spend across all customers 
summarize(sub_cust, Avg_spend = mean(CardSpendMonth, na.rm = TRUE) )
#2: Now compute the average CardSpendMonth for each gender. 
by_gender <- group_by(sub_cust, Gender) summarize(by_gender, Avg_spend = mean(CardSpendMonth, na.rm = TRUE))
#3: Now compute the average CardSpendMonth for each gender and region.  #   Which gender and region have the highest average spend? 
by_gdr_rgn <- group_by(sub_cust, Gender, Region) 
avg_gdr_rgn <- summarize(by_gdr_rgn, Avg_spend = mean(CardSpendMonth, na.rm = TRUE)) 
arrange(avg_gdr_rgn, desc(Avg_spend))

PIPE OPERATOR:

####################################################
#pipe operator 
#Chaining functions together with the pipe operator
####################################################

####
#STREAMLINING OUR ANALYSIS
####

sub_cust <- select(customer, CustomerID, Region, Gender, Age, HHIncome, CardSpendMonth)

# Going back to our last problem, our code was doing three things: 
# 1. grouping by gender and region 
# 2. summarizing average spend 
# 3. sorting spend by greatest to least

by_gdr_rgn <- group_by(sub_cust, Gender, Region)
avg_gdr_rgn <- summarize(by_gdr_rgn, Avg_spend = mean(CardSpendMonth, na.rm = TRUE))
arrange(avg_gdr_rgn, desc(Avg_spend))

#We can streamline our code to make it more efﬁcient and legible
library(magrittr) 
x <- 1:15 
sum(x) 
x %>% sum()

#• Lets re-write our code using the pipe (%>%) operator 
#• This code does four things in a very efﬁcient & readable manner

sub_cust %>%  
  group_by(Gender, Region) %>%  
  summarize(Avg_spend = mean(CardSpendMonth, na.rm = TRUE)) %>%  
  arrange(desc(Avg_spend))

####
#YOUR TURN! & SOLUTION
####

#Using the pipe operator follow these steps with the sub_cust data: 
#1. ﬁlter for male customers only 
#2. create a new variable: ratio = CardSpendMonth / HHIncome 
#3. group this data by age 
#4. compute the mean of the new ratio variable by age 
#5. sort this output to ﬁnd the age with the highest ratio of expenditures to income.

sub_cust %>%  
filter(Gender == "Male") %>%  
mutate(ratio = CardSpendMonth / HHIncome) %>%  
group_by(Age) %>%  
summarize(Avg_ratio = mean(ratio, na.rm = TRUE)) %>%  
arrange(desc(Avg_ratio))

Previous-02-data preparation Next-04-visualization

Last updated 6 years ago