# 02 - Data preparation

# Link to the module:
install.packages("readxl")
install.packages("tidyverse")
library(readxl) 
library(tidyverse) 
########################
# READING IN A .CSV FILE
########################
# Text files are a popular way to hold and exchange tabular data.
# Text file formats use delimiters to separate the different elements
# (.csv, .tsv, .txt, etc.).
# .csv is the most common format; use read_csv() to read it in.
# Called on its own, the result is shown but not stored anywhere.
read_csv(file = "mydata.csv")

# To use this data in R you need to save it to an R object.
# Here the imported data is stored in an object called mydata.
mydata <- read_csv(file = "mydata.csv")
print(mydata)

##########################
# READING IN AN EXCEL FILE
##########################
# Excel is still the spreadsheet software of choice.
# You need to know both the workbook and the sheet
# that you want to read in.

# List the sheets in the workbook to identify the one you want
excel_sheets(path = "mydata.xlsx")

# Now read in the data from the chosen sheet
read_excel(path = "mydata.xlsx", sheet = "PICK_ME_FIRST!")

###########################
# ADDITIONAL SPECIFICATIONS
###########################
# Similar specifications exist with readr::read_csv.

# skip = 2 skips (at least) the first two rows before reading anything,
# e.g. extra header lines above the real column names
read_excel(path = "mydata.xlsx", sheet = "extra_header", skip = 2)

# na = "999" tells read_excel to interpret the string "999" as missing
read_excel(path = "mydata.xlsx", sheet = "unique_NA", na = "999")

# Alternatively, you can import data directly through the RStudio IDE GUI:
# Import Dataset

############
# YOUR TURN!
############
# 1. Read in the CustomerData.csv file and save it as raw_data.
# 2. What spreadsheets are contained in the "CustomerData.xlsx" file?
# 3. Read in the spreadsheet that contains the data.

# 1. Import the .csv file and open it in the data viewer
raw_data <- read_csv(file = "CustomerData.csv")
View(raw_data)

# 2. List the sheet names contained in the workbook
excel_sheets(path = "CustomerData.xlsx")

# 3. Import the "Data" sheet; this replaces the raw_data object
#    created from the .csv file above
raw_data <- read_excel(path = "CustomerData.xlsx", sheet = "Data")
View(raw_data)

########################
# UNDERSTANDING OUR DATA
########################
# WHAT ARE THE DIMENSIONS?
# Load a built-in data set
data("mtcars")
nrow(mtcars)     # number of rows
ncol(mtcars)     # number of columns
dim(mtcars)      # rows and columns in one call
names(mtcars)    # variable (column) names
glimpse(mtcars)  # quick overview of every column

# Alternatively, open the whole data set in the data viewer:
View(mtcars)

############
#YOUR TURN!
############
# 1. What are the dimensions of our customer data (raw_data)?
# 2. What are the variable names?
# 3. Take a peek at the entire data set.

# 1. dimensions: number of rows followed by number of columns
dim(raw_data)

# 2. names of the variables (columns)
names(raw_data) 

# 3. take a peek at the entire data set in the data viewer
View(raw_data)

############################
# ARE THERE MISSING VALUES?
############################
# Missing values are identified with is.na(), which returns TRUE
# wherever a value is NA.
# Load a built-in data set
data("airquality")
is.na(airquality)

# How many missing values are there in total?
# (each TRUE counts as 1 when summed)
sum(is.na(airquality))

# Where are these missing values? Count them per column.
colSums(is.na(airquality))

# Remove missing values? na.omit() drops every row containing an NA.
clean_data <- na.omit(airquality)
clean_data


############
# YOUR TURN!
############
# 1. How many missing values are in our customer data (raw_data)?
# 2. Which variables have the most missing data?
# 3. How would you omit missing values so we have a complete data set?

# 1. Total number of missing values in the customer data
sum(is.na(raw_data))

# 2. Missing values per variable, then ordered from most to fewest
missing <- colSums(is.na(raw_data))
missing
sort(missing, decreasing = TRUE)

# 3. Drop every observation (row) that contains a missing value;
#    note this replaces the clean_data object created from airquality
clean_data <- na.omit(raw_data)
clean_data
dim(clean_data)

################
# DATA STRUCTURE
################

# VECTORS
# One dimension
# Can only contain homogeneous data
# For an example, type "state.abb" in your console
state.abb


# CREATING
# The most common ways to create a vector are c() and the : operator.
# For numeric vectors there are numerous ways to
# generate sequences of numbers.

# Vectors with no set sequence
c("Learning", "to", "create", "character", "vectors")
c(3, 2, 10, 55)
c(TRUE, FALSE, FALSE, FALSE, TRUE)

# Numeric vector with a regular sequence
6:15

# INDEXING
# Create this vector
v1 <- 1:10
v1[4]                 # a single element, by position
v1[4:7]               # a range of positions
v1[c(4, 3, 4)]        # arbitrary positions; repeats are allowed
v1[v1 > 6]            # elements that satisfy a condition
v1[v1 <= 3 | v1 > 8]  # elements that satisfy either condition


# QUICK SUMMARIES
length(v1)
summary(v1)
mean(v1)
median(v1)
v1 > 5       # elementwise comparison gives a logical vector
sum(v1 > 5)  # TRUE counts as 1, so this counts the elements above 5


############
# YOUR TURN!
############
# 1. Check out the built-in character vector state.name.
# 2. How many elements are in this vector?
# 3. Subset state.name for elements 35, 38, 14, 17.
#    Which states are these?

# 1. Check out state.name
state.name

# 2. How many elements are in state.name?
length(state.name)

# 3. Subset state.name for elements 35, 38, 14 and 17
state.name[c(35, 38, 14, 17)]


# MATRICES
# 2 dimensions
# • rows
# • columns
# • Can only contain homogeneous data
# • All columns must be of equal length

# CREATING
# Fix the random seed so the sampled values are reproducible
set.seed(123)
# Draw 25 values from 1:10 with replacement
# (this replaces the v1 vector created in the vectors section)
v1 <- sample(1:10, 25, replace = TRUE)
v1
# Arrange the 25 values into a matrix with 5 rows (and therefore 5 columns);
# matrix() fills column by column by default
m1 <- matrix(v1, nrow = 5)
m1

# INDEXING/SUBSETTING
# Extract an individual element (row 1, column 3)
m1[1, 3]
# Extract all rows and columns 1 through 3
m1[, 1:3]
# Extract rows 1 through 3 and all columns
m1[1:3, ]

# QUICK SUMMARIES
summary(m1)
mean(m1)       # mean of all elements
mean(m1[1, ])  # mean of the first row
rowMeans(m1)
colMeans(m1)
rowSums(m1)
colSums(m1)
# Every value is drawn from 1:10, so each comparison with 0.5 is TRUE
m1 > 0.5         # elementwise comparison gives a logical matrix
sum(m1 > 0.5)    # number of elements above 0.5
which(m1 > 0.5)  # positions of those elements
m1[m1 > 0.5]     # the elements themselves

############
# YOUR TURN!
############
# 1. Compute the column means of the built-in matrix named VADeaths.
# 2. Compute the row means of the built-in matrix named VADeaths.
# 3. Index VADeaths for females aged 55-64.


# 1. Column means
colMeans(VADeaths)
# 2. Row means
rowMeans(VADeaths)
# 3. Females aged 55-64: the "55-59" and "60-64" rows (rows 2 and 3) of the
#    two female columns (columns 2 and 4), selected here by name
VADeaths[c("55-59", "60-64"), c("Rural Female", "Urban Female")]


# DATA FRAMES
# • Spreadsheet-style data
# • 2 dimensions
# • rows
# • columns
# • Can contain heterogeneous data
# • All columns must be of equal length


# INDEXING
# Extract one column and all rows, either by column position
# or by column name
# NOTE(review): presumably column 4 is "Gender" - confirm against the data
raw_data[, 4]
raw_data[, "Gender"]

# Extract all rows and columns 1 through 3, by position ...
raw_data[, 1:3]

# ... or by name
raw_data[, c("CustomerID", "Region", "TownSize")]

# Extract the first row and all columns
raw_data[1, ]

# LISTS
# • 1 dimension
# • Can contain heterogeneous data -
#   multiple and different objects
#   (i.e. vectors, data frames, matrices, and even lists)

# Lists are very important objects in R!
# They may be confusing but they are worth learning.


# WHAT YOU NEED TO KNOW
# • Many statistical modeling results come in the form of lists
# • You need to know how to extract parts of a list to access model results

# Here's a linear regression model: mpg explained by wt in mtcars
model <- lm(formula = mpg ~ wt, data = mtcars)
model
summary(model)

# • model is simply a list of statistical results for our regression model

# Here's the same linear regression model again
model <- lm(formula = mpg ~ wt, data = mtcars)
names(model)  # names of the list components
str(model)    # structure of the whole list

# INDEXING

# • It's important that you know how to index/subset a list
# • Elements of lists can be extracted using 3 approaches:
#   preserve: list[component]
#   simplify: list[[component]]
#   simplify: list$component

# Try these on our model list
model["residuals"]               # still a list, holding one component
model[["residuals"]]             # the component itself
model$residuals                  # the component itself, by name
model[["residuals"]][seq_len(20)]  # first 20 values of the component


# • model is simply a list of statistical results for our regression model
# • So if you want to extract the residuals or fitted values you can just use
#   normal list subsetting procedures

# Extract the regression model residuals
model$residuals
# Previous: 01-introduction | Next: 03-data transformation
# Last updated 6 years ago