# -02-data preparation

Link to the module:

[Module02-data-preparation](https://github.com/itsmecevi/r-bootcamp-module02/blob/master/02-data-prep.pdf)

{% file src="/files/-LMJPgYd1HNXxM9JRJkg" %}
CustomerData.csv
{% endfile %}

{% file src="/files/-LMJRVJKcwMdmPqIDFLh" %}
CustomerData.xlsx
{% endfile %}

{% file src="/files/-LMJRbPx-Y4at0MPMRb6" %}
mydata.csv
{% endfile %}

{% file src="/files/-LMJRjEqZpHrXDHNfBSC" %}
mydata.xlsx
{% endfile %}

![](/files/-LLxZJCraRRIHqXFcuB1)

![](/files/-LLxZLaf8vAp9lc9-Gsd)

![](/files/-LLzeqslLeq0j7QWPn47)

```r
# Install each required package only when it is missing, so re-running the
# script does not download and reinstall packages that are already available.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

# Attach the packages used below: readxl supplies excel_sheets() and
# read_excel(); tidyverse supplies read_csv() and glimpse().
library(readxl)
library(tidyverse)
# Reading in a .csv file ----
# Text files are a popular way to hold and exchange tabular data.
# Text file formats use delimiters to separate the different elements
# (.csv, .tsv, .txt, etc.).
# .csv is the most common format; read it with read_csv().
read_csv(file = "mydata.csv")

# Reading alone only prints the data. To keep working with it in R, assign
# the result to an object -- here an object called mydata.
mydata <- read_csv(file = "mydata.csv")
print(mydata)

# Reading in an Excel file ----
# Excel is still the spreadsheet software of choice.
# You need to know both the workbook and the sheet that you want to read in.

# List the sheets in the workbook to identify the one you want.
excel_sheets(path = "mydata.xlsx")

# Now read in the data from the chosen sheet.
read_excel(path = "mydata.xlsx", sheet = "PICK_ME_FIRST!")

# Additional specifications ----
# Similar specifications exist for readr::read_csv().

# Skip the first two rows of the sheet before reading.
read_excel(
  path = "mydata.xlsx",
  sheet = "extra_header",
  skip = 2
)

# Treat the string "999" as a missing value (NA).
read_excel(
  path = "mydata.xlsx",
  sheet = "unique_NA",
  na = "999"
)

# Alternatively, import data through the RStudio IDE GUI
# ("Import Dataset").

# Your turn! ----
# 1. Read in the CustomerData.csv file and save it as raw_data.
# 2. What spreadsheets are contained in the "CustomerData.xlsx" file?
# 3. Read in the spreadsheet that contains the data.

# 1. Read the .csv file into raw_data and open it in the data viewer.
raw_data <- read_csv(file = "CustomerData.csv")
View(raw_data)

# 2. List the sheet names in the workbook.
excel_sheets(path = "CustomerData.xlsx")

# 3. Read the sheet named "Data"; this overwrites the raw_data object
#    created from the .csv file above.
raw_data <- read_excel(
  path = "CustomerData.xlsx",
  sheet = "Data"
)
View(raw_data)

# Understanding our data ----
# What are the dimensions?

# Load a built-in data set.
data(mtcars)

nrow(mtcars)    # number of rows
ncol(mtcars)    # number of columns
dim(mtcars)     # rows and columns together
names(mtcars)   # variable (column) names
glimpse(mtcars) # compact overview of every column

# Alternatively, open the data in the spreadsheet-style viewer:
View(mtcars)

# Your turn! ----
# 1. What are the dimensions of our customer data (raw_data)?
# 2. What are the variable names?
# 3. Take a peek at the entire data set.

# 1. Dimensions
dim(raw_data)

# 2. Names of the variables
names(raw_data)

# 3. Take a peek at the entire data set
View(raw_data)

# Are there missing values? ----
# Missing values are identified with is.na().

# Load a built-in data set.
data(airquality)

# TRUE/FALSE for every cell: TRUE marks a missing value.
is.na(airquality)

# How many missing values are there in total?
sum(is.na(airquality))

# Where are they? Count the missing values per column.
colSums(is.na(airquality))

# Remove missing values: na.omit() drops every row containing an NA.
clean_data <- na.omit(airquality)
clean_data


# Your turn! ----
# 1. How many missing values are in our customer data (raw_data)?
# 2. Which variables have the most missing data?
# 3. How would you omit missing values so we have a complete data set?

# 1. Total number of missing values in the customer data
sum(is.na(raw_data))

# 2. Missing values per variable, then sorted so the variables with the
#    most missing values come first
missing <- colSums(is.na(raw_data))
missing
sort(missing, decreasing = TRUE)

# 3. Drop every observation (row) that contains a missing value, then
#    check how many rows and columns remain
clean_data <- na.omit(raw_data)
clean_data
dim(clean_data)

# Data structures ----

# Vectors ----
# - One dimension
# - Can only contain homogeneous data
# For an example, type "state.abb" in your console.
state.abb

# Creating vectors
# The most common ways to create a vector are c() and the : operator.
# For numeric vectors there are numerous ways to generate sequences
# of numbers.

# Vectors with no set sequence
c("Learning", "to", "create", "character", "vectors")
c(3, 2, 10, 55)
c(TRUE, FALSE, FALSE, FALSE, TRUE)

# Numeric vector with a regular sequence
6:15

# Indexing
# Create the vector used in the examples below.
v1 <- 1:10
v1[4]               # a single element by position
v1[4:7]             # a range of positions
v1[c(4, 3, 4)]      # arbitrary positions; repeats are allowed
v1[v1 > 6]          # elements satisfying a condition
v1[v1 > 8 | v1 <= 3] # elements satisfying either condition

# Quick summaries
length(v1)
summary(v1)
mean(v1)
median(v1)
v1 > 5      # logical vector: which elements exceed 5?
sum(v1 > 5) # how many elements exceed 5?


# Your turn! ----
# 1. Check out the built-in character vector state.name.
# 2. How many elements are in this vector?
# 3. Subset state.name for elements 35, 38, 14, 17.
#    Which states are these?

# 1. Check out state.name
state.name

# 2. How many elements are in state.name?
length(state.name)

# 3. Subset state.name for elements 35, 38, 14 and 17
state.name[c(35, 38, 14, 17)]


# Matrices ----
# - Two dimensions: rows and columns
# - Can only contain homogeneous data
# - All columns must be of equal length

# Creating
# Fix the random seed so the sampled values are reproducible.
set.seed(123)
# Draw 25 values from 1:10 with replacement ...
v1 <- sample(1:10, 25, replace = TRUE)
v1
# ... and arrange them in a matrix with 5 rows (and therefore 5 columns).
m1 <- matrix(v1, nrow = 5)
m1

# Indexing / subsetting
# Extract an individual element (row 1, column 3)
m1[1, 3]
# Extract all rows and columns 1 through 3
m1[, 1:3]
# Extract rows 1 through 3 and all columns
m1[1:3, ]

# Quick summaries
summary(m1)
mean(m1)      # mean of all elements
mean(m1[1, ]) # mean of the first row
rowMeans(m1)
colMeans(m1)
rowSums(m1)
colSums(m1)

# Logical comparisons. Every value in m1 was drawn from 1:10, so all
# entries exceed 0.5 here.
m1 > .5        # logical matrix
sum(m1 > .5)   # number of entries satisfying the condition
which(m1 > .5) # positions of those entries
m1[m1 > .5]    # the entries themselves, as a vector

# Your turn! ----
# 1. Compute the column means of the built-in matrix named VADeaths.
# 2. Compute the row means of the built-in matrix named VADeaths.
# 3. Index VADeaths for females aged 55-64.

# 1. Column means
colMeans(VADeaths)
# 2. Row means
rowMeans(VADeaths)
# 3. Females aged 55-64: rows 2 and 3, columns 2 and 4
VADeaths[2:3, c(2, 4)]


# Data frames ----
# - Spreadsheet-style data
# - Two dimensions: rows and columns
# - Can contain heterogeneous data
# - All columns must be of equal length

# Indexing
# Extract a single column and all rows, either by column position
# (here the fourth column) or by column name
raw_data[, 4]
raw_data[, "Gender"]

# Extract all rows and columns 1 through 3, by position ...
raw_data[, 1:3]

# ... or by naming the columns
raw_data[, c("CustomerID", "Region", "TownSize")]

# Extract the first row and all columns
raw_data[1, ]

# Lists ----
# - One dimension
# - Can contain heterogeneous data, i.e. multiple and different objects
#   (vectors, data frames, matrices, and even other lists)

# Lists are very important objects in R!
# They may be confusing at first, but they are worth learning.

# What you need to know
# - Many statistical modeling results come in the form of lists.
# - You need to know how to extract parts of a list to access model results.

# Here is a linear regression model of mpg on wt
model <- lm(mpg ~ wt, data = mtcars)
model
summary(model)

# - The model object is simply a list of statistical results for our
#   regression model.

# Same fit as above (repeated), this time inspecting the list behind it
model <- lm(mpg ~ wt, data = mtcars)
names(model) # names of the list components
str(model)   # structure of every component

# Indexing
# - It is important to know how to index/subset a list.
# - Elements of a list can be extracted using three approaches:
#     preserve: list[component]     (result is still a list)
#     simplify: list[[component]]   (result is the component itself)
#     simplify: list$component      (result is the component itself)

# Try these on our model list
model["residuals"]
model[["residuals"]]
model$residuals
model[["residuals"]][1:20] # first 20 residuals

# - Because the model is a list, the residuals or fitted values can be
#   extracted with the normal list subsetting procedures.

# Extract the regression model residuals
model$residuals
```

![](/files/-LMJU6XGDs3djTlw3-S4)


---

# Agent Instructions: Querying This Documentation

If you need additional information that is not directly available in this page, you can query the documentation dynamically by asking a question.

Perform an HTTP GET request on the current page URL with the `ask` query parameter:

```
GET https://r-pedia.gitbook.io/cevi/intro-to-r-bootcamp/02-data-preparation.md?ask=<question>
```

The question should be specific, self-contained, and written in natural language.
The response will contain a direct answer to the question and relevant excerpts and sources from the documentation.

Use this mechanism when the answer is not explicitly present in the current page, you need clarification or additional context, or you want to retrieve related documentation sections.
