# 02 - Data preparation
# Last updated
# Last updated
# Link to the module:
install.packages("readxl")
install.packages("tidyverse")
library(readxl)
library(tidyverse)
########################
# READING IN A .CSV FILE
########################
# Plain-text files are a popular way to store and exchange tabular data.
# These formats (.csv, .tsv, .txt, etc.) rely on a delimiter to separate
# the individual elements.
# .csv is the most common one; read it in with read_csv().
read_csv("mydata.csv")

# To work with this data in R it has to be saved to an R object.
# Here the result is stored in an object called mydata.
mydata <- read_csv("mydata.csv")
print(mydata)
##########################
# READING IN AN EXCEL FILE
##########################
# Excel is still the spreadsheet software of choice.
# To read Excel data you need to know both the workbook (the file)
# and the sheet inside it that you want to read in.

# First, list the sheets in the workbook to identify the one you want
excel_sheets("mydata.xlsx")

# Then read in the data from that sheet
read_excel("mydata.xlsx", sheet = "PICK_ME_FIRST!")

###########################
# ADDITIONAL SPECIFICATIONS
###########################
# Similar specifications exist with readr::read_csv

# skip = 2 skips the first two rows of the sheet before reading
read_excel("mydata.xlsx", sheet = "extra_header", skip = 2)

# na = "999" reads cells containing 999 as missing values (NA)
read_excel("mydata.xlsx", sheet = "unique_NA", na = "999")

# Alternatively, you can import data directly with the GUI of the
# RStudio IDE:
# Import Dataset
############
# YOUR TURN!
############
# 1. Read in the CustomerData.csv file and save as raw_data
# 2. What spreadsheets are contained in the "CustomerData.xlsx" file?
# 3. Read in the spreadsheet that contains the data.

# Answer 1: read in the .csv file
raw_data <- read_csv("CustomerData.csv")
View(raw_data)

# Answer 2: list the spreadsheet names in the workbook
excel_sheets("CustomerData.xlsx")

# Answer 3: read in the spreadsheet that contains the data
# (this replaces the raw_data object created from the .csv file above)
raw_data <- read_excel("CustomerData.xlsx", sheet = "Data")
View(raw_data)
########################
# UNDERSTANDING OUR DATA
########################
# WHAT ARE THE DIMENSIONS?

# Load a built-in data set
data(mtcars)

nrow(mtcars)     # number of rows
ncol(mtcars)     # number of columns
dim(mtcars)      # rows and columns together
names(mtcars)    # variable (column) names
glimpse(mtcars)  # compact overview of the columns

# Alternatively, you can simply open the data set in the viewer:
View(mtcars)
############
# YOUR TURN!
############
# 1. What are the dimensions of our customer data (raw_data)?
# 2. What are the variable names?
# 3. Take a peek at the entire data set.

# Answer 1: dimensions
dim(raw_data)

# Answer 2: names of the variables
names(raw_data)

# Answer 3: take a peek at the entire data set
View(raw_data)
###########################
# ARE THERE MISSING VALUES?
###########################
# Missing values are identified with is.na()

# Load a built-in data set
data(airquality)

# One TRUE/FALSE per cell; TRUE marks a missing value
is.na(airquality)

# How many missing values are there in total?
sum(is.na(airquality))

# Where are these missing values? Count them per column
colSums(is.na(airquality))

# Remove missing values: na.omit() drops every row that contains an NA
clean_data <- na.omit(airquality)
clean_data
############
# YOUR TURN!
############
# 1. How many missing values are in our customer data (raw_data)?
# 2. Which variables have the most missing data?
# 3. How would you omit missing values so we have a complete data set?

# Answer 1: total number of missing values in the customer data
sum(is.na(raw_data))

# Answer 2: missing values per variable, then sorted so the variables
# with the most missing values come first
missing <- colSums(is.na(raw_data))
missing
sort(missing, decreasing = TRUE)

# Answer 3: delete the observations (rows) that have missing values
clean_data <- na.omit(raw_data)
clean_data
dim(clean_data)
################
# DATA STRUCTURE
################
# VECTORS
# - 1 (one) dimension
# - Can only contain homogeneous data
# For an example, type "state.abb" in your console
state.abb

# CREATING
# The most common ways to create a vector are c() and :
# For numeric vectors there are numerous ways to generate
# sequences of numbers.

# Vectors with no set sequence
c("Learning", "to", "create", "character", "vectors")
c(3, 2, 10, 55)
c(TRUE, FALSE, FALSE, FALSE, TRUE)

# Numeric vector with a regular sequence
6:15

# INDEXING
# Create this vector
v1 <- 1:10

v1[4]                 # the fourth element
v1[4:7]               # elements four through seven
v1[c(4, 3, 4)]        # chosen positions; a position may be repeated
v1[v1 > 6]            # elements greater than 6
v1[v1 > 8 | v1 <= 3]  # elements greater than 8 or at most 3

# QUICK SUMMARIES
length(v1)
summary(v1)
mean(v1)
median(v1)
v1 > 5       # logical vector: is each element greater than 5?
sum(v1 > 5)  # number of elements greater than 5
############
# YOUR TURN!
############
# 1. Check out the built-in character vector state.name
# 2. How many elements are in this vector?
# 3. Subset state.name for elements 35, 38, 14, 17.
#    Which states are these?

# Answer 1: check out state.name
state.name

# Answer 2: number of elements in state.name
length(state.name)

# Answer 3: subset state.name for elements 35, 38, 14 and 17
state.name[c(35, 38, 14, 17)]
# MATRICES
# - 2 dimensions
#   - rows
#   - columns
# - Can only contain homogeneous data
# - All columns must be of equal length

# CREATING
# Fix the random seed so the sampled values are reproducible
set.seed(123)
# Draw 25 values from 1 to 10, with replacement
v1 <- sample(1:10, 25, replace = TRUE)
v1
# Arrange the 25 values in 5 rows (and therefore 5 columns)
m1 <- matrix(v1, nrow = 5)
m1

# INDEXING/SUBSETTING
# extract an individual element (row 1, column 3)
m1[1, 3]
# extract all rows and columns 1 through 3
m1[, 1:3]
# extract rows 1 through 3 and all columns
m1[1:3, ]

# QUICK SUMMARIES
summary(m1)
mean(m1)
# mean of the first row
mean(m1[1, ])
rowMeans(m1)
colMeans(m1)
rowSums(m1)
colSums(m1)
# Logical comparisons work element by element. The values of m1 are
# integers from 1 to 10, so every element exceeds .5 here.
m1 > .5
# number of elements greater than .5
sum(m1 > .5)
# positions of the elements greater than .5
which(m1 > .5)
# the elements greater than .5 themselves
m1[m1 > .5]
############
# YOUR TURN!
############
# 1. Compute the column means of the built-in matrix named VADeaths.
# 2. Compute the row means of the built-in matrix named VADeaths.
# 3. Index VADeaths for females aged 55-64.

# Answer 1: column means
colMeans(VADeaths)

# Answer 2: row means
rowMeans(VADeaths)

# Answer 3: females aged 55-64
# Rows 2 and 3 are the age groups 55-59 and 60-64;
# columns 2 and 4 are "Rural Female" and "Urban Female".
VADeaths[2:3, c(2, 4)]
# DATA FRAMES
# - Spreadsheet style data
# - 2 dimensions
#   - rows
#   - columns
# - Can contain heterogeneous data
# - All columns must be of equal length

# INDEXING
# Extract a single column and all rows, either by column position
# (here the fourth column) or by column name.
# NOTE(review): presumably "Gender" is the fourth column of raw_data,
# so both lines return the same column - confirm against the data.
raw_data[, 4]
raw_data[, "Gender"]

# Extract all rows and columns 1 through 3, by position ...
raw_data[, 1:3]
# ... or by listing the column names
raw_data[, c("CustomerID", "Region", "TownSize")]

# Index for the first row and all columns
raw_data[1, ]
# LISTS
# - 1 dimension
# - Can contain heterogeneous data, i.e. multiple and different
#   objects (vectors, data frames, matrices, and even lists)
# Lists are very important objects in R!
# They may be confusing but they are worth learning.

# WHAT YOU NEED TO KNOW
# - Many statistical modeling results come in the form of lists
# - You need to know how to extract parts of a list to access
#   model results

# Here's a linear regression model
model <- lm(mpg ~ wt, data = mtcars)
model
summary(model)

# - model is simply a list of statistical results for our regression
# Fit the same linear regression model again and look inside it
model <- lm(mpg ~ wt, data = mtcars)
names(model)  # names of the list components
str(model)    # structure of every component

# INDEXING
# - It's important that you know how to index/subset a list
# - Elements of lists can be extracted using 3 approaches:
#     preserve: list[component]
#     simplify: list[[component]]
#     simplify: list$component

# Try these on our model list
model["residuals"]           # preserve: still a list
model[["residuals"]]         # simplify: the component itself
model$residuals              # simplify: same component, by name
model[["residuals"]][1:20]   # first 20 residuals

# - model is simply a list of statistical results for our regression
# - So to extract the residuals or fitted values you can just use
#   normal list subsetting procedures

# Extract the regression model residuals
model$residuals