-Dealing with String
Handling, cleaning and processing character strings is becoming a prerequisite in daily data analysis.
#Character string basics
Creating Strings:
x <- "Berlin"
y <- "Paris"
# The paste() function is the workhorse for creating text
a <- "learning to create"
b <- "character strings"
# paste together string a & b
paste(a, b)
## [1] "learning to create character strings"
# paste character and number strings
# (converts numbers to character class)
paste("The life of", pi)
## [1] "The life of 3.14159265358979"
# paste multiple strings
paste("I", "love", "R")
## [1] "I love R"
# paste multiple strings with a separating character
paste("I", "love", "R", sep = "-")
## [1] "I-love-R"
# use paste0() to paste without spaces btwn characters
paste0("I", "love", "R")
## [1] "IloveR"
# paste objects with different lengths
paste("R", 1:5, sep = " v1.")
## [1] "R v1.1" "R v1.2" "R v1.3" "R v1.4" "R v1.5"
Converting to Strings:
a <- "The life of"
b <- pi
is.character(a)
## [1] TRUE
is.character(b)
## [1] FALSE
c <- as.character(b)
is.character(c)
## [1] TRUE
toString(c("Aug", 24, 1980))
## [1] "Aug, 24, 1980"
Printing Strings:
print(): generic printing
noquote(): print with no quotes
cat(): concatenate and print with no quotes
sprintf(): a wrapper for the C function sprintf, that returns a character vector containing a formatted combination of text and variable values
x <- "learning to print strings"
# basic printing
print(x)
## [1] "learning to print strings"
# print without quotes
print(x, quote = FALSE)
## [1] learning to print strings
noquote(x)
## [1] learning to print strings
#Alternative:
# basic printing (similar to noquote)
cat(x)
## learning to print strings
# combining character strings
cat(x, "in R")
## learning to print strings in R
# basic printing of alphabet
cat(letters)
## a b c d e f g h i j k l m n o p q r s t u v w x y z
# specify a separator between the combined characters
cat(letters, sep = "-")
## a-b-c-d-e-f-g-h-i-j-k-l-m-n-o-p-q-r-s-t-u-v-w-x-y-z
# collapse the space between the combined characters
cat(letters, sep = "")
## abcdefghijklmnopqrstuvwxyz
x <- "Today I am learning how to print strings."
y <- "Tomorrow I plan to learn about textual analysis."
z <- "The day after I will take a break and drink a beer."
cat(x, y, z, fill = 0)
## Today I am learning how to print strings. Tomorrow I plan to learn about textual analysis. The day after I will take a break and drink a beer.
cat(x, y, z, fill = 5)
## Today I am learning how to print strings.
## Tomorrow I plan to learn about textual analysis.
## The day after I will take a break and drink a beer.
To substitute in a string or string variable, use %s:
x <- "print strings"
# substitute a single string/variable
sprintf("Learning to %s in R", x)
## [1] "Learning to print strings in R"
# substitute multiple strings/variables
y <- "in R"
sprintf("Learning to %s %s", x, y)
## [1] "Learning to print strings in R"
For integers, use %d or a variant:
version <- 3
# substitute integer
sprintf("This is R version:%d", version)
## [1] "This is R version:3"
# print with leading spaces
sprintf("This is R version:%4d", version)
## [1] "This is R version:   3"
# can also lead with zeros
sprintf("This is R version:%04d", version)
## [1] "This is R version:0003"
For floating-point numbers, use %f for standard notation, and %e or %E for exponential notation:
sprintf("%f", pi) # '%f' indicates 'fixed point' decimal
#notation
## [1] "3.141593"
sprintf("%.3f", pi) # decimal notation with 3 decimal digits
## [1] "3.142"
sprintf("%1.0f", pi) # 1 integer and 0 decimal digits
## [1] "3"
sprintf("%5.1f", pi) # minimum field width of 5 characters,
# with only 1 digit to the right of the decimal point
## [1] "  3.1"
sprintf("%05.1f", pi) # same as above but fill empty digits
#with zeros
## [1] "003.1"
sprintf("%+f", pi) # print with sign (positive)
## [1] "+3.141593"
sprintf("% f", pi) # prefix a space
## [1] " 3.141593"
sprintf("%e", pi) # exponential decimal notation 'e'
## [1] "3.141593e+00"
sprintf("%E", pi) # exponential decimal notation 'E'
## [1] "3.141593E+00"
Counting string elements and characters:
length("How many elements are in this string?")
## [1] 1
length(c("How", "many", "elements", "are", "in", "this", "string?"))
## [1] 7
nchar("How many characters are in this string?")
## [1] 39
nchar(c("How", "many", "characters", "are", "in", "this", "string?"))
## [1] 3 4 10 3 2 4 7
#String manipulation with base R
Case conversion:
x <- "Learning To MANIPULATE strinGS in R"
tolower(x)
## [1] "learning to manipulate strings in r"
toupper(x)
## [1] "LEARNING TO MANIPULATE STRINGS IN R"
Simple Character Replacement:
# replace 'A' with 'a'
x <- "This is A string."
chartr(old = "A", new = "a", x)
## [1] "This is a string."
# multiple character replacements
# replace any 'd' with 't' and any 'z' with 'a'
y <- "Tomorrow I plzn do lezrn zbout dexduzl znzlysis."
chartr(old = "dz", new = "ta", y)
## [1] "Tomorrow I plan to learn about textual analysis."
String Abbreviations:
streets <- c("Main", "Elm", "Riverbend", "Mario", "Frederick")
# default abbreviations
abbreviate(streets)
## Main Elm Riverbend Mario Frederick
## "Main" "Elm" "Rvrb" "Mari" "Frdr"
# set minimum length of abbreviation
abbreviate(streets, minlength = 2)
## Main Elm Riverbend Mario Frederick
## "Mn" "El" "Rv" "Mr" "Fr"
Extract/Replace Substrings:
alphabet <- paste(LETTERS, collapse = "")
# extract 18th character in string
substr(alphabet, start = 18, stop = 18)
## [1] "R"
# extract 18-24th characters in string
substr(alphabet, start = 18, stop = 24)
## [1] "RSTUVWX"
# replace 19th-24th characters with `R`
substr(alphabet, start = 19, stop = 24) <- "RRRRRR"
alphabet
## [1] "ABCDEFGHIJKLMNOPQRRRRRRRYZ"
alphabet <- paste(LETTERS, collapse = "")
# extract 18th through last character
substring(alphabet, first = 18)
## [1] "RSTUVWXYZ"
# vectorized extraction; specify start positions only (one substring per start)
substring(alphabet, first = 18:24)
## [1] "RSTUVWXYZ" "STUVWXYZ" "TUVWXYZ" "UVWXYZ" "VWXYZ" "WXYZ"
## [7] "XYZ"
# vectorized extraction; specify start and stop positions
substring(alphabet, first = 1:5, last = 3:7)
## [1] "ABC" "BCD" "CDE" "DEF" "EFG"
z <- "The day after I will take a break and drink a beer."
strsplit(z, split = " ")
## [[1]]
## [1] "The" "day" "after" "I" "will" "take" "a" "break"
## [9] "and" "drink" "a" "beer."
a <- "Alabama-Alaska-Arizona-Arkansas-California"
strsplit(a, split = "-")
## [[1]]
## [1] "Alabama" "Alaska" "Arizona" "Arkansas" "California"
unlist(strsplit(a, split = "-"))
## [1] "Alabama" "Alaska" "Arizona" "Arkansas" "California"
#String manipulation with stringr
The stringr package, developed by Hadley Wickham, is the go-to tool for string manipulation.
# install stringr package
install.packages("stringr")
# load package
library(stringr)
Basic Operations:
Three string functions that are closely related to their base R equivalents:
Concatenate with str_c()
Number of characters with str_length()
Substring with str_sub()
# same as paste0()
str_c("Learning", "to", "use", "the", "stringr", "package")
## [1] "Learningtousethestringrpackage"
# same as paste()
str_c("Learning", "to", "use", "the", "stringr", "package", sep = " ")
## [1] "Learning to use the stringr package"
# allows recycling
str_c(letters, " is for", "...")
## [1] "a is for..." "b is for..." "c is for..."......
# some text with NA
text = c("Learning", "to", NA, "use", "the", NA, "stringr", "package")
# compare `str_length()` with `nchar()`
nchar(text)
## [1] 8 2 2 3 3 2 7 7
## (R < 3.3.0 only; since R 3.3.0 nchar() also returns NA for missing strings: 8 2 NA 3 3 NA 7 7)
str_length(text)
## [1] 8 2 NA 3 3 NA 7 7
x <- "Learning to use the stringr package"
# alternative indexing
str_sub(x, start = 1, end = 15)
## [1] "Learning to use"
str_sub(x, end = 15)
## [1] "Learning to use"
str_sub(x, start = 17)
## [1] "the stringr package"
str_sub(x, start = c(1, 17), end = c(15, 35))
## [1] "Learning to use" "the stringr package"
# using negative indices for start/end points from end of string
str_sub(x, start = -1)
## [1] "e"
str_sub(x, start = -19)
## [1] "the stringr package"
str_sub(x, end = -21)
## [1] "Learning to use"
# Replacement
str_sub(x, end = 15) <- "I know how to use"
x
## [1] "I know how to use the stringr package"
Duplicate Characters within a String:
str_dup("beer", times = 3)
## [1] "beerbeerbeer"
str_dup("beer", times = 1:3)
## [1] "beer" "beerbeer" "beerbeerbeer"
# use with a vector of strings
states_i_luv <- state.name[c(6, 23, 34, 35)]
str_dup(states_i_luv, times = 2)
## [1] "ColoradoColorado" "MinnesotaMinnesota"
## [3] "North DakotaNorth Dakota" "OhioOhio"
Remove Leading and Trailing Whitespace:
text <- c("Text ", " with", " whitespace ", " on", "both ", " sides ")
# remove whitespaces on the left side
str_trim(text, side = "left")
## [1] "Text " "with" "whitespace " "on" "both "
## [6] "sides "
# remove whitespaces on the right side
str_trim(text, side = "right")
## [1] "Text" " with" " whitespace" " on" "both"
## [6] " sides"
# remove whitespaces on both sides
str_trim(text, side = "both")
## [1] "Text" "with" "whitespace" "on" "both"
## [6] "sides"
Pad a String with Whitespace:
str_pad("beer", width = 10, side = "left")
## [1] "      beer"
str_pad("beer", width = 10, side = "both")
## [1] "   beer   "
str_pad("beer", width = 10, side = "right", pad = "!")
## [1] "beer!!!!!!"
#Set operations for character strings
Set Union:
set_1 <- c("lagunitas", "bells", "dogfish", "summit", "odell")
set_2 <- c("sierra", "bells", "harpoon", "lagunitas", "founders")
union(set_1, set_2)
## [1] "lagunitas" "bells" "dogfish" "summit" "odell" "sierra"
## [7] "harpoon" "founders"
Set Intersection:
intersect(set_1, set_2)
## [1] "lagunitas" "bells"
Identifying Different Elements:
# returns elements in set_1 not in set_2
setdiff(set_1, set_2)
## [1] "dogfish" "summit" "odell"
# returns elements in set_2 not in set_1
setdiff(set_2, set_1)
## [1] "sierra" "harpoon" "founders"
Testing for Element Equality:
set_3 <- c("woody", "buzz", "rex")
set_4 <- c("woody", "andy", "buzz")
set_5 <- c("andy", "buzz", "woody")
setequal(set_3, set_4)
## [1] FALSE
setequal(set_4, set_5)
## [1] TRUE
Testing for Exact Equality:
set_6 <- c("woody", "andy", "buzz")
set_7 <- c("andy", "buzz", "woody")
set_8 <- c("woody", "andy", "buzz")
identical(set_6, set_7)
## [1] FALSE
identical(set_6, set_8)
## [1] TRUE
Identifying if Elements are Contained in a String:
good <- "andy"
bad <- "sid"
is.element(good, set_8)
## [1] TRUE
good %in% set_8
## [1] TRUE
bad %in% set_8
## [1] FALSE
Sorting a String:
sort(set_8)
## [1] "andy" "buzz" "woody"
sort(set_8, decreasing = TRUE)
## [1] "woody" "buzz" "andy"
Last updated