# -03-data transformation

Link to the module:

[Module03-data-transformation](https://github.com/itsmecevi/r-bootcamp-module-03/blob/master/03-transformation.pdf)

{% file src="<https://4050850663-files.gitbook.io/~/files/v0/b/gitbook-legacy-files/o/assets%2F-LJ9jSFe6fncOCaGtdZ3%2F-LNZWCqE685oiACf2ql_%2F-LNZXUqn079X7Nk2IqFq%2FCustomerData.csv?alt=media&token=f9986082-327a-48fa-9818-cc24f21953d5>" %}
CustomerData
{% endfile %}

![](https://4050850663-files.gitbook.io/~/files/v0/b/gitbook-legacy-files/o/assets%2F-LJ9jSFe6fncOCaGtdZ3%2F-LNVGdAn5iZ_3gpqyTNJ%2F-LNVGw8S_BojOY0os1cm%2Fdata-transformation.PNG?alt=media\&token=708a3ed0-507e-4809-8586-2161d4dbf615)

![](https://4050850663-files.gitbook.io/~/files/v0/b/gitbook-legacy-files/o/assets%2F-LJ9jSFe6fncOCaGtdZ3%2F-LNVX0BbZLhhczfIlmFu%2F-LNVX3pKKXpQ0ODNzh5r%2Fdplyr.PNG?alt=media\&token=aeb9550b-17e5-40d4-b5c2-8b6a0495c95d)

More about dplyr: <https://dplyr.tidyverse.org/>

dplyr cheatseat from RStudio: <https://www.rstudio.com/resources/cheatsheets/#dplyr>

![](https://4050850663-files.gitbook.io/~/files/v0/b/gitbook-legacy-files/o/assets%2F-LJ9jSFe6fncOCaGtdZ3%2F-LNVX0BbZLhhczfIlmFu%2F-LNVXH8PurJn1GVqfECP%2Fbasics.PNG?alt=media\&token=38ba0855-6065-4f6b-9f39-a185ba6c7a0d)

```r
library(nycflights13)
library(tidyverse)
############
#YOUR TURN!
############

#Are there vignettes for the dplyr package?
#Can you find additional documentation explaining the
#flights data set?

#SOLUTION
# are there vignettes for the dplyr package —> yes, 8 of them
vignette(package = "dplyr")
# additional documentation for the mpg data set
?flights
```

## FILTER:

```r
###############################################
#filter
#Filter values based on defined conditions
###############################################

#Filter based on one or more variables
View(flights)
filter(flights, month == 1)
View(filter(flights, month == 1))

filter(flights, month == 1, day == 1)
View(filter(flights, month == 1, day == 1))

filter(flights, month == 1, day == 1, dep_delay > 0)
View(filter(flights, month == 1, day == 1, dep_delay > 0))

#SAVE NEW DATA FRAME
#• Save filter data frame using assignment operator (<-)
dec25 <- filter(flights, month == 12, day == 25)
dec25

####
#LOGICAL TESTS
####
12 == 12
#[1] TRUE
12 <= c(12, 11)
#[1] TRUE FALSE
12 %in% c(12, 11, 8) #Group membership
#[1] TRUE
x <- c(12, NA, 11, NA, 8)
is.na(x)
#[1] FALSE TRUE FALSE TRUE FALSE

##?Comparison##
# < Less than
# > Greater than
# == Equal to
# <= Less than or equal to
# >= Greater than or equal to
# != Not equal to
# %in% Group membership
# is.na Is NA
# !is.na Is not NA

####
#COMPARISON
####
#What will these operations produce?
filter(flights, month == 12)
filter(flights, month != 12)
filter(flights, month %in% c(11, 12))
filter(flights, arr_delay <= 120)
filter(flights, !(arr_delay <= 120))
filter(flights, is.na(tailnum))

####
#MULTIPLE LOGICAL TESTS
####
12 == 12 & 12 < 14 
#[1] TRUE
12 == 12 & 12 < 10
#[1] FALSE
12 == 12 | 12 < 10
#[1] TRUE
any(12 == 12, 12 < 10)
#[1] TRUE
all(12 == 12, 12 < 10)
#[1] FALSE

##?base::Logic##
# & boolean and
# | boolean or
# xor exclusively x or y
# ! not
# any any true :at least one is TRUE
# all all true

#####
#MULTIPLE COMPARISONS 
####
#Using comma is same as using &
filter(flights, month == 12, day == 25)
filter(flights, month == 12 & day == 25)

#Use %in% as a shortcut for |
filter(flights, month == 11 | month == 12)
filter(flights, month %in% c(11, 12))

#Are these the same????
filter(flights, !(arr_delay > 120 | dep_delay > 120))
filter(flights, arr_delay <= 120, dep_delay <= 120)

####
#YOUR TURN & SOLUTION
####
# 1: import the data
customer <- read_csv("data/CustomerData.csv")
#Import the data with IDE RStudio
customer<-CustomerData
# 2: filter for female customers only
filter(customer, Gender == "Female")
# 3: filter for female customers that are greater than 45 years old and live in region 3
filter(customer, Gender == "Female", Age > 45, Region == 3)
# 4: filter for female customers that are greater than 45 years old or live in region 3
filter(customer, Gender == "Female", Age > 45 | Region == 3)
```

## **SELECT**:

```r
#############################
#select 
#Select variables of concern
#############################
customer<-CustomerData
####
#SELECTING VARIABLES
###

#Select one or more variables
select(flights, year, month, day) 

#Same as:
select(flights, year:day)

####
#SELECTING VARIABLES
####
#Deselect one or more variables
select(flights, -(year:day)) 

####
#USEFUL select FUNCTIONS
####
# - Select everything but 
# : Select range 
# contains() Select columns whose name contains a character string 
# ends_with() Select columns whose name ends with a string 
# everything() Select every column 
# matches() Select columns whose name matches a regular expression 
# num_range() Select columns named x1, x2, x3, x4, x5 
# one_of() Select columns whose names are in a group of names 
# starts_with() Select columns whose name starts with a character string 

select(flights, ends_with("time")) 
select(flights, c(carrier, ends_with("time"), contains("delay"))) 

####
#VARIABLE PLACEMENT
####

select(flights, time_hour, air_time, everything()) 

####
#RENAMING VARIABLES
####
rename(flights, ANNOYING = dep_delay) 

####
#YOUR TURN! & SOLUTION
####
# 1. select all variables between CustomerID and Gender 
select(customer, CustomerID:Gender)
# 2. select all variables except for those between CustomerID and Gender 
select(customer, -(CustomerID:Gender))
#3. select CustomerID and all variables that contain the word “Card” 
select(customer, CustomerID, contains("Card"))
```

## ARRANGE:

```r
###############
#arrange 
#Reorder data
###############
customer<-CustomerData
####
#ORDERING YOUR DATA
####

#Order data based on one or more variables
arrange(flights, dep_delay) 
arrange(flights, dep_delay, arr_delay) 

#Reverse the order by using desc() 
arrange(flights, desc(dep_delay)) 

#Note that missing values are always sorted at the end:
df <- tibble(x = c(5, 2, 5, NA)) 
arrange(df, x) 
arrange(df, desc(x)) 

####
#YOUR TURN! & SOLUTION
####
# 1: select variables 
sub_cust <- select(customer, CustomerID, Region, Gender, Age, HHIncome, CardSpendMonth)
# 2: Order sub_cust data by Age and CardSpendMonth (ascending order) 
arrange(customer, Age, CardSpendMonth)
# 3: Order sub_cust data by Age (oldest to youngest) and CardSpendMonth (least to most) 
arrange(customer, desc(Age), CardSpendMonth)
```

## MUTATE:

```r
##########################################################
#mutate 
#Create new variables with functions of existing variables
##########################################################

####
#REDUCE OUR DATA
####
#Lets work with a smaller data set
flights_sml <- select(flights,   year:day,   ends_with("delay"),   distance,   air_time )
flights_sml 


####
#CREATE NEW VARIABLES
####

#mutate() creates new variables with functions of existing variables:
mutate(flights_sml,  gain = arr_delay - dep_delay,  speed = distance / air_time * 60 ) 


#Note: you can create variables based on columns that you’ve just created: 
mutate(flights_sml,  gain = arr_delay - dep_delay,  hours = air_time / 60,  gain_per_hour = gain / hours ) 

#If you only want to keep the new variables use transmute():
transmute(flights,  gain = arr_delay - dep_delay,  hours = air_time / 60,  gain_per_hour = gain / hours ) 



####
#MANY USEFUL CREATION FUNCTIONS:
###
#There are a wide variety of functions you can use with mutate()
# +, -, *, /, ^ arithmetic 
# x / sum(x) arithmetic w/aggregate functions 
# %/%, %% modular arithmetic 
# log, exp, sqrt transformations 
# lag, lead offsets 
# cumsum, cumprod, cum… cum/rolling aggregates 
# >, >=, <, <=, 
# !=, == logical comparisons 
# min_rank, dense_rank, etc ranking 
# between are values between a and b? 
# ntile bin values into buckets


transmute(flights,  normalized_delay = dep_delay / mean(dep_delay, na.rm = TRUE)) 

transmute(flights,  log_air_time = log2(air_time),  exp_delay = exp(dep_delay)) 

transmute(flights,  dep_delay = dep_delay,  lag_delay = lag(dep_delay),  sum_delay = cumsum(dep_delay)) 


####
#YOUR TURN! & SOLUTION
####
#1: create a ratio variable that computes the ratio of CardSpendMonth to HHIncome 
mutate(sub_cust, ratio = CardSpendMonth / HHIncome)

#2: create 2 variables: #        
#i) ratio1 = CardSpendMonth / HHIncome        
#ii) ratio2 = CardSpendMonth / Age 
mutate(sub_cust,   ratio1 = CardSpendMonth / HHIncome,  ratio2 = CardSpendMonth / Age  )


```

## SUMMARISE:

```r
#########################################################
#summarise
#Collapse many values down to a single summary statistic
#########################################################

####
#SUMMARIZING OUR DATA
####

#We can create summary statistics of one or more variables:
summarise(flights, dep_delay_mean = mean(dep_delay, na.rm = TRUE)) 

#We can create summary statistics of one or more variables:
summarise(flights,   dep_delay_mean = mean(dep_delay, na.rm = TRUE),  dep_delay_sd = sd(dep_delay, na.rm = TRUE) )

summarise(flights,   dep_delay_mean = mean(dep_delay, na.rm = TRUE),  dep_delay_sd = sd(dep_delay, na.rm = TRUE),  n = n() ) 

####
#SUMMARY FUNCTIONS
####
#* All take a vector of values and return a single value 
#** Blue functions come in dplyr
# min(), max() Minimum and maximum values 
# mean() Mean value 
# median() Median value 
# sum() Sum of values 
# var, sd() Variance and standard deviation of a vector 
# first() First value in a vector last() Last value in a vector 
# nth() Nth value in a vector 
# n() The number of values in a vector 
# n_distinct() The number of distinct values in a vector

####
#SUMMARIZING GROUPED DATA
####

#Summary statistics become more powerful when we can compare groups:
by_day <- group_by(flights, year, month, day) 
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE)) 

####
#YOUR TURN! & SOLUTION
####

#1: Avg spend across all customers 
summarize(sub_cust, Avg_spend = mean(CardSpendMonth, na.rm = TRUE) )
#2: Now compute the average CardSpendMonth for each gender. 
by_gender <- group_by(sub_cust, Gender) summarize(by_gender, Avg_spend = mean(CardSpendMonth, na.rm = TRUE))
#3: Now compute the average CardSpendMonth for each gender and region.  #   Which gender and region have the highest average spend? 
by_gdr_rgn <- group_by(sub_cust, Gender, Region) 
avg_gdr_rgn <- summarize(by_gdr_rgn, Avg_spend = mean(CardSpendMonth, na.rm = TRUE)) 
arrange(avg_gdr_rgn, desc(Avg_spend))
```

## PIPE OPERATOR:

```r
####################################################
#pipe operator 
#Chaining functions together with the pipe operator
####################################################

####
#STREAMLINING OUR ANALYSIS
####

sub_cust <- select(customer, CustomerID, Region, Gender, Age, HHIncome, CardSpendMonth)

# Going back to our last problem, our code was doing three things: 
# 1. grouping by gender and region 
# 2. summarizing average spend 
# 3. sorting spend by greatest to least

by_gdr_rgn <- group_by(sub_cust, Gender, Region)
avg_gdr_rgn <- summarize(by_gdr_rgn, Avg_spend = mean(CardSpendMonth, na.rm = TRUE))
arrange(avg_gdr_rgn, desc(Avg_spend))

#We can streamline our code to make it more efﬁcient and legible
library(magrittr) 
x <- 1:15 
sum(x) 
x %>% sum()

#• Lets re-write our code using the pipe (%>%) operator 
#• This code does four things in a very efﬁcient & readable manner

sub_cust %>%  
  group_by(Gender, Region) %>%  
  summarize(Avg_spend = mean(CardSpendMonth, na.rm = TRUE)) %>%  
  arrange(desc(Avg_spend))

####
#YOUR TURN! & SOLUTION
####

#Using the pipe operator follow these steps with the sub_cust data: 
#1. ﬁlter for male customers only 
#2. create a new variable: ratio = CardSpendMonth / HHIncome 
#3. group this data by age 
#4. compute the mean of the new ratio variable by age 
#5. sort this output to ﬁnd the age with the highest ratio of expenditures to income.

sub_cust %>%  
filter(Gender == "Male") %>%  
mutate(ratio = CardSpendMonth / HHIncome) %>%  
group_by(Age) %>%  
summarize(Avg_ratio = mean(ratio, na.rm = TRUE)) %>%  
arrange(desc(Avg_ratio))
```


---

# Agent Instructions: Querying This Documentation

If you need additional information that is not directly available in this page, you can query the documentation dynamically by asking a question.

Perform an HTTP GET request on the current page URL with the `ask` query parameter:

```
GET https://r-pedia.gitbook.io/cevi/intro-to-r-bootcamp/03-data-transformation.md?ask=<question>
```

The question should be specific, self-contained, and written in natural language.
The response will contain a direct answer to the question and relevant excerpts and sources from the documentation.

Use this mechanism when the answer is not explicitly present in the current page, you need clarification or additional context, or you want to retrieve related documentation sections.
