# 04-visualization
# Last updated
#####################
#PACKAGE PREREQUISITE
#####################
# Install tidyverse only when it is missing, so re-running the script does
# not re-download and re-install the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# Print the mpg data set to the console
mpg

# Open a spreadsheet view of the data. View() needs an interactive session,
# so skip it when the script is run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(mpg)
}
####
#YOUR TURN! & SOLUTION
####
# Open the help page that documents the mpg data set
help("mpg")
####
#CANVAS
####
# Let's create our "canvas"
# Left: only the data is supplied, no aesthetic mappings yet
ggplot(mpg)
# Right: displ is mapped to the x axis and hwy to the y axis
ggplot(mpg, mapping = aes(displ, hwy))
#######
#GEOMS
#######
#• We display data with geometric shapes
#• ~ 30 built-in geoms (with many more offered by other pkgs)
#https://ggplot2.tidyverse.org/reference/
#Type geom_ + tab in the console
####
#UNIVARIATE GEOMS
####
# Histogram of highway mileage
ggplot(mpg, aes(hwy)) +
  geom_histogram()

# Frequency polygon of highway mileage
ggplot(mpg, aes(hwy)) +
  geom_freqpoly()

# Density curve of highway mileage
ggplot(mpg, aes(hwy)) +
  geom_density()

# Bar chart counting the rows in each vehicle class
ggplot(mpg, aes(class)) +
  geom_bar()

# aes(x = class) in the bar chart above is called an aesthetic mapping argument
# • Every geom requires a mapping argument
# • Some geoms require just one (x variable)
# • While other geoms require two (x & y variable)
####
#BIVARIATE GEOMS
####
# Scatter plot: engine displacement against highway mileage
ggplot(mpg, aes(displ, hwy)) +
  geom_point()

# Box plot: highway mileage within each vehicle class
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot()

# Violin plot: highway mileage within each vehicle class
ggplot(mpg, aes(class, hwy)) +
  geom_violin()
####
#YOUR TURN! & SOLUTION
####
# 1: import the customer data from the project's data/ folder
customer <- read_csv(file = "data/CustomerData.csv")

# 2: distribution of the DebtToIncomeRatio variable
ggplot(customer, aes(DebtToIncomeRatio)) +
  geom_histogram()

# 3: distribution of the JobCategory variable
ggplot(customer, aes(JobCategory)) +
  geom_bar()

# 4: scatter plot of HHIncome vs CardSpendMonth
ggplot(customer, aes(HHIncome, CardSpendMonth)) +
  geom_point()
####
#NON-MAPPING AESTHETICS
####
# We can also change other visual aesthetics in our graphics
# • color
# • size
# • shape (0-25, see ?pch)
# Fixed (non-mapped) aesthetics are set as arguments of the geom, outside aes()
ggplot(mpg, aes(displ, hwy)) +
  geom_point(alpha = 0.5, shape = 17, size = 2, color = "blue")
# Why are some points darker than others?
# • Try geom_jitter in place of geom_point
# • What do you think geom_jitter does?
####
#ADDING A 3RD DIMENSION
####
# Color set outside aes(): every point is drawn blue
ggplot(mpg, aes(displ, hwy)) +
  geom_point(color = "blue")

# By moving the color argument to within aes(),
# we can map a 3rd variable (class) to our plot
ggplot(mpg, aes(displ, hwy, color = class)) +
  geom_point()

# A common error... what happened?
# Here "red" sits inside aes(), so it is mapped like a variable
# instead of being used as a fixed point color.
ggplot(mpg, aes(displ, hwy, color = "red")) +
  geom_point()
####
#YOUR TURN! & SOLUTION
####
# 1. Create a scatter plot of HHIncome vs CardSpendMonth
#    and color all points blue.
ggplot(data = customer, mapping = aes(HHIncome, CardSpendMonth)) +
  geom_point(color = "blue")

# 2. Create a scatter plot of HHIncome vs CardSpendMonth
#    and color all points based on whether or not the customer is retired.
ggplot(data = customer, mapping = aes(HHIncome, CardSpendMonth, color = Retired)) +
  geom_point()
########
#FACETS
########
# Facets = small multiples
# • The facet functions provide a simple way to create small multiples
# • facet_wrap: primarily used to create small multiples based on a single
#   variable
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(facets = ~class, nrow = 2)
# • use nrow or ncol to specify dimensions
# • see ?facet_wrap for other arguments that control the output

# • facet_grid: primarily used to create a small multiples grid based on
#   two variables (here drv on the rows and cyl on the columns)
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)
####
#YOUR TURN! & SOLUTION
####
# 1. Create a scatter plot of HHIncome vs CardSpendMonth
#    facetted by JobCategory.
ggplot(customer, aes(x = HHIncome, y = CardSpendMonth)) +
  geom_point() +
  facet_wrap(~ JobCategory)

# 2. Create a scatter plot of HHIncome vs CardSpendMonth
#    facetted by JobCategory & Gender (Gender on rows, JobCategory on columns).
ggplot(customer, aes(x = HHIncome, y = CardSpendMonth)) +
  geom_point() +
  facet_grid(Gender ~ JobCategory)

# 3. Assess UnionMember across each JobCategory
#    (one bar chart of UnionMember counts per JobCategory panel).
ggplot(customer, aes(x = UnionMember)) +
  geom_bar() +
  facet_wrap(~ JobCategory)
################
#TITLES & AXES:
################
####
#ADDING TITLES
####
# NOTE(review): both titles say "Displacement" but x is mapped to class;
# confirm whether the title or the mapping is the intended one.

# top: main title only
ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_point() +
  ggtitle("Displacement vs Highway MPG")

# bottom: main title plus subtitle
# The title and subtitle must be two separate strings delimited by straight
# double quotes; curly quotes are plain characters inside a string, so they
# would merge everything into one long title and leave the subtitle unset.
ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_boxplot() +
  ggtitle(
    "Displacement vs Highway MPG",
    subtitle = "Data from 1999 & 2008"
  )
####
#ADJUSTING AXIS SCALES
####
# Draw the x axis (volume) on a log10 scale; semi-transparent points
ggplot(txhousing, aes(volume, median)) +
  geom_point(alpha = 0.25) +
  scale_x_log10()
####
#ADJUSTING AXIS TITLES & LABELS
####
# Name both axes and format their tick labels:
# dollars on the y axis, comma-separated numbers on the log10 x axis
ggplot(txhousing, aes(volume, median)) +
  geom_point(alpha = 0.25) +
  scale_y_continuous(
    "Median Sales Price",
    labels = scales::dollar
  ) +
  scale_x_log10(
    "Total Sales Volume",
    labels = scales::comma
  )
####
#PUT IT ALL TOGETHER
####
# Combine the formatted axes with a title and subtitle
ggplot(txhousing, aes(volume, median)) +
  geom_point(alpha = 0.25) +
  scale_y_continuous(
    "Median Sales Price",
    labels = scales::dollar
  ) +
  scale_x_log10(
    "Total Sales Volume",
    labels = scales::comma
  ) +
  ggtitle(
    label = "Texas Housing Sales",
    subtitle = "Sales data from 2000-2010 provided by the TAMU real estate center"
  )
####
#YOUR TURN! & SOLUTION
####
# 1. Remove all missing values from the customer data and then...
# 2. Create a scatter plot of HHIncome vs CardSpendMonth facetted by
#    JobCategory and...
# 3. Add a title, subtitle, and nicely format the axes.
customer %>%
  na.omit() %>%
  ggplot(mapping = aes(HHIncome, CardSpendMonth)) +
  geom_point(alpha = 0.3) +
  facet_wrap(~ JobCategory) +
  scale_x_log10(
    name = "Household Income",
    labels = scales::dollar
  ) +
  scale_y_log10(
    name = "Monthly Card Expenditures",
    labels = scales::dollar
  ) +
  ggtitle(
    label = "Relationship between income and credit card expenditures",
    subtitle = "All customers (4th Qtr 2015)"
  )
#############
#OVERPLOTTING
#############
####
#LAYERING HELPS DISPLAY PATTERNS
####
# Points plus a smoothed line from geom_smooth()'s default method
ggplot(txhousing, aes(volume, median)) +
  geom_point(alpha = 0.25) +
  scale_x_log10() +
  geom_smooth()

# Points plus a linear model fit (method = "lm")
ggplot(txhousing, aes(volume, median)) +
  geom_point(alpha = 0.25) +
  scale_x_log10() +
  geom_smooth(method = "lm")

# Same linear fit, drawn separately for each month
ggplot(txhousing, aes(volume, median)) +
  geom_point(alpha = 0.25) +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  facet_wrap(~ month)
#######################
#YOUR TURN! & SOLUTION
######################
# 1. Remove all missing values from the customer data and then...
# 2. Create a scatter plot of HHIncome vs CardSpendMonth facetted by
#    JobCategory and...
# 3. Add a title, subtitle, and nicely format the axes and...
# 4. Add a linear line to assess if the slope changes across JobCategory
customer %>%
  na.omit() %>%
  ggplot(mapping = aes(HHIncome, CardSpendMonth)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm") +
  facet_wrap(~ JobCategory) +
  scale_x_log10(
    name = "Household Income",
    labels = scales::dollar
  ) +
  scale_y_log10(
    name = "Monthly Card Expenditures",
    labels = scales::dollar
  ) +
  ggtitle(
    label = "Relationship between income and credit card expenditures",
    subtitle = "All customers (4th Qtr 2015)"
  )
# ggplot()                         Initializes a ggplot object (creates the blank canvas)
# aes()                            Creates aesthetic mappings
# geom_xx                          Geometric shapes to plot the data
# color, shape, size, alpha, etc.  Aesthetic parameters
# facet_wrap, facet_grid           Create small multiples
# position                         Position argument (primarily used with bar charts)
# coord_xx                         Functions to adjust the coordinate system
# scale_xx                         Functions to adjust x and y axis