# 5-Utilities

#1-Useful Functions

# mean()
# abs()
# round()
# sum()
# seq()
# rep()
# is.*() --> output: TRUE or FALSE
# as.*() --> convert an object to another class
# append()
# rev()

#2-Mathematical utilities

# Have another look at some useful math functions that R features: 
# abs(): Calculate the absolute value.
# sum(): Calculate the sum of all the values in a data structure.
# mean(): Calculate the arithmetic mean.
# round(): Round the values to 0 decimal places by default. 
#Try out ?round in the console for variations of round() and
# ways to change the number of digits to round to.

#Calculate the sum of the absolute rounded values of the training errors. 
#You can work in parts, or with a single one-liner. 
#There's no need to store the result in a variable, just have R print it.

# Training errors supplied by the exercise
errors <- c(1.9, -2.6, 4.0, -9.5, -3.4, 7.3)

# Round each error to 0 decimals, take the magnitudes, then total them.
# The result is not stored; R auto-prints it.
sum(abs(round(errors, digits = 0)))

#3-Find the error

#Fix the error by including code on the last line. 
#Remember: you want to call mean() only once!

# Exercise inputs (left untouched): a numeric vector and its reversal
vec1 <- c(1.5, 2.5, 8.4, 3.7, 6.3)
vec2 <- rev(vec1)
vec1
vec2

# Combine both vectors first so that mean() is called exactly once
mean(abs(c(vec1, vec2)))

#4-Data Utilities

# R features a bunch of functions to juggle around with data structures:
  
# seq(): Generate sequences, by specifying the from, to, and by arguments.
# rep(): Replicate elements of vectors and lists.
# sort(): Sort a vector in ascending order. Works on numerics, 
#but also on character strings and logicals.
# rev(): Reverse the elements in a data structure for which reversal is defined.
# str(): Display the structure of any R object.
# append(): Merge vectors or lists.
# is.*(): Check for the class of an R object.
# as.*(): Convert an R object from one class to another.
# unlist(): Flatten (possibly embedded) lists to produce a vector.

#Convert both linkedin and facebook lists to a vector, and 
#store them as li_vec and fb_vec respectively.
#Next, append fb_vec to the li_vec (Facebook data comes last). 
#Save the result as social_vec.
#Finally, sort social_vec from high to low. Print the resulting vector.

# The linkedin and facebook lists have already been created for you
linkedin <- list(16, 9, 13, 5, 2, 17, 14)
facebook <- list(17, 7, 5, 16, 8, 13, 14)

# Convert linkedin and facebook to a vector: li_vec and fb_vec.
# unlist() is needed here: as.vector() applied to a list hands the list back
# unchanged, so the results would still be lists rather than numeric vectors.
li_vec <- unlist(linkedin)
fb_vec <- unlist(facebook)

# Append fb_vec to li_vec: social_vec (Facebook data comes last)
social_vec <- append(li_vec, fb_vec)
social_vec

# Sort social_vec from high to low and print the result
sort(social_vec, decreasing = TRUE)

#5-Find the error (2)

#Correct the expression. Make sure that your fix still uses the functions rep() 
#and seq().

# Repeat the odd numbers 1, 3, 5, 7 seven times over (uses both rep() and seq())
rep(seq(from = 1, to = 7, by = 2), times = 7)

#6-Beat Gauss using R

#Using the function seq(), create a sequence that ranges from 1 to 500 
#in increments of 3. Assign the resulting vector to a variable seq1.
#Again with the function seq(), create a sequence that ranges from 1200 to 900 
#in increments of -7. Assign it to a variable seq2.
#Calculate the total sum of the sequences, either by using the sum() 
#function twice and adding the two results, or 
#by first concatenating the sequences and then using the sum() function once. 
#Print the result to the console.

# seq1: 1, 4, 7, ... up to 500 in steps of 3
seq1 <- seq(from = 1, to = 500, by = 3)
seq1

# seq2: 1200, 1193, ... down towards 900 in steps of -7
seq2 <- seq(from = 1200, to = 900, by = -7)
seq2

# Grand total: concatenate both sequences and call sum() once
sum(c(seq1, seq2))

#7-Regular Expressions

# -Sequence of meta characters
# -Pattern existence
# -Pattern replacement
# -Pattern extraction
# -grepl()->output: TRUE FALSE
# grep()-> Position of the vector
# sub()
# gsub()

#8-grepl & grep

#Use grepl() to generate a vector of logicals that indicates 
#whether these email addresses contain "edu". 
#Print the result to the output.
#Do the same thing with grep(), but this time save the resulting indexes
#in a variable hits.
#Use the variable hits to select from the emails vector only the emails that 
#contain "edu".

# Sample email addresses supplied by the exercise
emails <- c(
  "john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org",
  "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv"
)

# Logical vector: does each address contain "edu" anywhere?
grepl("edu", emails)

# Positions of the addresses that contain "edu", kept in hits
hits <- grep("edu", emails)
hits

# Keep only the matching addresses
emails[hits]

#9-grepl & grep (2)

#Use grepl() with the more advanced regular expression to return a logical vector. 
#Simply print the result.
#Do a similar thing with grep() to create a vector of indices. 
#Store the result in the variable hits.
#Use emails[hits] again to subset the emails vector.

# Sample email addresses supplied by the exercise
emails <- c(
  "john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org",
  "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv"
)

# Stricter test: an "@", then anything, then a literal ".edu" at the very end
grepl("@.*\\.edu$", emails)

# Same pattern with grep() to get the matching positions, kept in hits
hits <- grep("@.*\\.edu$", emails)
hits

# Keep only the genuine .edu addresses
emails[hits]

#10-sub & gsub

#With the advanced regular expression "@.*\\.edu$", use sub() to replace 
#the match with "@datacamp.edu". 
#Since there will only be one match per character string, 
#gsub() is not necessary here. 
#Inspect the resulting output.

# Sample email addresses supplied by the exercise
emails <- c(
  "john.doe@ivyleague.edu", "education@world.gov", "global@peace.org",
  "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv"
)

# Swap the domain of every real .edu address for datacamp.edu; addresses that
# do not match the pattern are returned unchanged.
sub("@.*\\.edu$", "@datacamp.edu", emails)

#11-sub & gsub (2)

# Award descriptions used for the regular-expression quiz below
awards <- c(
  "Won 1 Oscar.",
  "Won 1 Oscar. Another 9 wins & 24 nominations.",
  "1 win and 2 nominations.",
  "2 wins & 3 nominations.",
  "Nominated for 2 Golden Globes. 1 more win & 2 nominations.",
  "4 wins & 1 nomination."
)

# Replace each matching string with the captured group \\1, i.e. the number
# that directly precedes "nomination"; non-matching strings stay as they are.
sub(
  pattern = ".*\\s([0-9]+)\\snomination.*$",
  replacement = "\\1",
  x = awards
)

#What does this code chunk return? awards is already defined in the workspace 
#so you can start playing in the console straight away.
#A vector of integers containing: 1, 24, 2, 3, 2, 1.
#The vector awards gets returned as there isn't a single element in 
#awards that matches the regular expression.
#A vector of character strings containing "1", "24", "2", "3", "2", "1".
#A vector of character strings containing 
#"Won 1 Oscar.", "24", "2", "3", "2", "1".->The right answer

#12-Times and Dates

# as.Date -> time differences are expressed in days
# as.POSIXct -> time differences are in seconds by default,
# but for large differences R reports them in a more readable time unit
# Dedicated packages for working with dates and times: lubridate, zoo, xts

#13-Right here, right now

# Ask R for the current date, and store the result in a variable today.
# To see what today looks like under the hood, call unclass() on it.
# Ask R for the current time, and store the result in a variable, now.
# To see the numerical value that corresponds to now, call unclass() on it.

# Current calendar date, stored in today
today <- Sys.Date()

# Strip the class attribute to reveal the plain number behind the Date
unclass(today)

# Current date-time, stored in now
now <- Sys.time()

# Strip the class attribute to reveal the plain number behind the timestamp
unclass(now)

#14-Create and format dates

#In the editor on the right, three character strings 
#representing dates have been created. 
#Convert them to dates using as.Date(), and assign them to date1, date2, and 
#date3 respectively. 
#The code for date1 is already included.
#Extract useful information from the dates as character strings using format(). 
#From the first date, select the weekday. From the second date, 
#select the day of the month. 
#From the third date, you should select the abbreviated month and 
#the 4-digit year, separated by a space.

# Three dates written in three different textual layouts
str1 <- "May 23, '96"
str2 <- "2012-03-15"
str3 <- "30/January/2006"

# Parse each string into a Date; str2 needs no explicit format argument
date1 <- as.Date(x = str1, format = "%b %d, '%y")
date2 <- as.Date(x = str2)
date3 <- as.Date(x = str3, format = "%d/%B/%Y")

# Weekday name of the first date
format(date1, format = "%A")
# Day of the month of the second date
format(date2, format = "%d")
# Abbreviated month and 4-digit year of the third date
format(date3, format = "%b %Y")

#15-Create and format times

#Convert two strings that represent timestamps, str1 and str2, 
#to POSIXct objects called time1 and time2.
#Using format(), create a string from time1 containing only the minutes.
#From time2, extract the hours and minutes as "hours:minutes AM/PM". 
#Refer to the assignment text above to find the correct conversion symbols!

# Two timestamps written in different textual layouts
str1 <- "May 23, '96 hours:23 minutes:01 seconds:45"
str2 <- "2012-3-12 14:23:08"

# Parse both strings into POSIXct; the literal words in str1 ("hours:",
# "minutes:", "seconds:") are spelled out verbatim inside the format string
time1 <- as.POSIXct(
  x = str1,
  format = "%B %d, '%y hours:%H minutes:%M seconds:%S"
)
time2 <- as.POSIXct(x = str2)

# Minutes only, taken from time1
format(time1, format = "%M")
# 12-hour clock "hours:minutes AM/PM", taken from time2
format(time2, format = "%I:%M %p")

#16-Calculations with Dates

#Calculate the number of days that passed between the last and 
#the first day you ate pizza. Print the result.
#Use the function diff() on pizza to calculate the differences between 
#consecutive pizza days. Store the result in a new variable day_diff.
#Calculate the average period between two consecutive pizza days. Print the result.

# The five pizza days (provided by the exercise workspace)
day1 <- as.Date("2017-11-09")
day2 <- as.Date("2017-11-11")
day3 <- as.Date("2017-11-16")
day4 <- as.Date("2017-11-22")
day5 <- as.Date("2017-11-27")

# Collect all pizza days in one Date vector: pizza
pizza <- c(day1, day2, day3, day4, day5)

# Days elapsed between the first and the last pizza day
# (shown once by auto-printing and once through an explicit print())
day5 - day1
print(day5 - day1)

# Gaps between consecutive pizza days: day_diff
day_diff <- diff(pizza)

# Average gap between two consecutive pizza days
print(mean(day_diff))

#17-Calculations with Times

#Calculate the difference between the two vectors logout and login, 
#i.e. the time the user was online in each independent session. 
#Store the result in a variable time_online.
#Inspect the variable time_online by printing it.
#Calculate the total time that the user was online. Print the result.
#Calculate the average time the user was online. Print the result.

# Session start times. The strings are labelled UTC, so they are parsed with
# tz = "UTC"; without it as.POSIXct() would read them in the local time zone.
login <- as.POSIXct(
  c("2017-11-13 10:18:04 UTC", "2017-11-18 09:14:18 UTC",
    "2017-11-18 12:21:51 UTC", "2017-11-18 12:37:24 UTC",
    "2017-11-20 21:37:55 UTC"),
  tz = "UTC"
)

# Session end times, parsed the same way so each pair lines up with login
logout <- as.POSIXct(
  c("2017-11-13 10:56:29 UTC", "2017-11-18 09:14:52 UTC",
    "2017-11-18 12:35:48 UTC", "2017-11-18 13:17:22 UTC",
    "2017-11-20 22:08:47 UTC"),
  tz = "UTC"
)

# Inspect both vectors and confirm their class.
# (The original referenced the undefined symbol `as.vectorlogout`, which
# aborted the script with an "object not found" error.)
login
logout
class(login)
class(logout)

# Calculate the difference between login and logout: time_online
time_online <- logout - login

# Inspect the variable time_online
time_online

# Calculate the total time online
sum(time_online)

# Calculate the average time online
mean(time_online)

#18-Time is of the essence

# Use as.Date() to convert the astro vector to a vector containing Date objects. 
# You will need the %d, %b and %Y symbols to specify the format. 
#Store the resulting vector as astro_dates.
# Use as.Date() to convert the meteo vector to a vector with Date objects. 
# This time, you will need the %B, %d and %y symbols for the format argument. 
# Store the resulting vector as meteo_dates.
# With a combination of max(), abs() and -, 
#calculate the maximum absolute difference between the astronomical and 
#the meteorological beginnings of a season, i.e. astro_dates and meteo_dates. 
#Simply print this maximum difference to the console output.

# Astronomical and meteorological season starts, as raw character strings
astro <- c("20-Mar-2015", "25-Jun-2015", "23-Sep-2015", "22-Dec-2015")
meteo <- c("March 1, 15", "June 1, 15", "September 1, 15", "December 1, 15")
astro
meteo

# astro uses day, abbreviated month and 4-digit year: astro_dates
astro_dates <- as.Date(x = astro, format = "%d-%b-%Y")
astro_dates
class(astro_dates)

# meteo uses full month name, day and 2-digit year: meteo_dates
meteo_dates <- as.Date(x = meteo, format = "%B %d, %y")
meteo_dates

# Largest absolute gap between the two definitions of a season start
print(max(abs(astro_dates - meteo_dates)))

# Last updated